deepeval 3.6.6__py3-none-any.whl → 3.6.8__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (134)
  1. deepeval/_version.py +1 -1
  2. deepeval/benchmarks/equity_med_qa/equity_med_qa.py +1 -0
  3. deepeval/cli/main.py +42 -0
  4. deepeval/confident/api.py +1 -0
  5. deepeval/config/settings.py +22 -4
  6. deepeval/constants.py +8 -1
  7. deepeval/dataset/dataset.py +2 -11
  8. deepeval/dataset/utils.py +1 -1
  9. deepeval/errors.py +20 -2
  10. deepeval/evaluate/evaluate.py +5 -1
  11. deepeval/evaluate/execute.py +811 -248
  12. deepeval/evaluate/types.py +1 -0
  13. deepeval/evaluate/utils.py +33 -119
  14. deepeval/integrations/crewai/__init__.py +7 -1
  15. deepeval/integrations/crewai/handler.py +1 -1
  16. deepeval/integrations/crewai/subs.py +51 -0
  17. deepeval/integrations/crewai/tool.py +71 -0
  18. deepeval/integrations/crewai/wrapper.py +45 -5
  19. deepeval/integrations/llama_index/__init__.py +0 -4
  20. deepeval/integrations/llama_index/handler.py +20 -21
  21. deepeval/integrations/pydantic_ai/instrumentator.py +125 -76
  22. deepeval/metrics/__init__.py +13 -0
  23. deepeval/metrics/answer_relevancy/answer_relevancy.py +12 -3
  24. deepeval/metrics/api.py +281 -0
  25. deepeval/metrics/argument_correctness/argument_correctness.py +12 -2
  26. deepeval/metrics/base_metric.py +1 -0
  27. deepeval/metrics/bias/bias.py +12 -3
  28. deepeval/metrics/contextual_precision/contextual_precision.py +39 -24
  29. deepeval/metrics/contextual_recall/contextual_recall.py +12 -3
  30. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +12 -1
  31. deepeval/metrics/conversation_completeness/conversation_completeness.py +12 -0
  32. deepeval/metrics/conversational_dag/conversational_dag.py +12 -0
  33. deepeval/metrics/conversational_dag/nodes.py +12 -4
  34. deepeval/metrics/conversational_g_eval/__init__.py +3 -0
  35. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +84 -66
  36. deepeval/metrics/dag/dag.py +12 -0
  37. deepeval/metrics/dag/nodes.py +12 -4
  38. deepeval/metrics/dag/schema.py +1 -1
  39. deepeval/metrics/dag/templates.py +2 -2
  40. deepeval/metrics/faithfulness/faithfulness.py +12 -1
  41. deepeval/metrics/g_eval/g_eval.py +11 -0
  42. deepeval/metrics/goal_accuracy/__init__.py +1 -0
  43. deepeval/metrics/goal_accuracy/goal_accuracy.py +349 -0
  44. deepeval/metrics/goal_accuracy/schema.py +17 -0
  45. deepeval/metrics/goal_accuracy/template.py +235 -0
  46. deepeval/metrics/hallucination/hallucination.py +20 -9
  47. deepeval/metrics/indicator.py +8 -2
  48. deepeval/metrics/json_correctness/json_correctness.py +12 -1
  49. deepeval/metrics/knowledge_retention/knowledge_retention.py +12 -0
  50. deepeval/metrics/mcp/mcp_task_completion.py +20 -2
  51. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +29 -6
  52. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +14 -2
  53. deepeval/metrics/misuse/misuse.py +12 -1
  54. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +3 -0
  55. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +3 -0
  56. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +3 -0
  57. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +3 -0
  58. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +6 -1
  59. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +38 -25
  60. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +3 -0
  61. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +3 -0
  62. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +3 -0
  63. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +3 -0
  64. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +10 -5
  65. deepeval/metrics/non_advice/non_advice.py +12 -0
  66. deepeval/metrics/pii_leakage/pii_leakage.py +12 -1
  67. deepeval/metrics/plan_adherence/__init__.py +1 -0
  68. deepeval/metrics/plan_adherence/plan_adherence.py +292 -0
  69. deepeval/metrics/plan_adherence/schema.py +11 -0
  70. deepeval/metrics/plan_adherence/template.py +170 -0
  71. deepeval/metrics/plan_quality/__init__.py +1 -0
  72. deepeval/metrics/plan_quality/plan_quality.py +292 -0
  73. deepeval/metrics/plan_quality/schema.py +11 -0
  74. deepeval/metrics/plan_quality/template.py +101 -0
  75. deepeval/metrics/prompt_alignment/prompt_alignment.py +12 -1
  76. deepeval/metrics/role_adherence/role_adherence.py +12 -0
  77. deepeval/metrics/role_violation/role_violation.py +12 -0
  78. deepeval/metrics/step_efficiency/__init__.py +1 -0
  79. deepeval/metrics/step_efficiency/schema.py +11 -0
  80. deepeval/metrics/step_efficiency/step_efficiency.py +234 -0
  81. deepeval/metrics/step_efficiency/template.py +256 -0
  82. deepeval/metrics/summarization/summarization.py +12 -1
  83. deepeval/metrics/task_completion/task_completion.py +4 -0
  84. deepeval/metrics/tool_correctness/schema.py +6 -0
  85. deepeval/metrics/tool_correctness/template.py +88 -0
  86. deepeval/metrics/tool_correctness/tool_correctness.py +233 -21
  87. deepeval/metrics/tool_use/__init__.py +1 -0
  88. deepeval/metrics/tool_use/schema.py +19 -0
  89. deepeval/metrics/tool_use/template.py +220 -0
  90. deepeval/metrics/tool_use/tool_use.py +458 -0
  91. deepeval/metrics/topic_adherence/__init__.py +1 -0
  92. deepeval/metrics/topic_adherence/schema.py +16 -0
  93. deepeval/metrics/topic_adherence/template.py +162 -0
  94. deepeval/metrics/topic_adherence/topic_adherence.py +355 -0
  95. deepeval/metrics/toxicity/toxicity.py +12 -0
  96. deepeval/metrics/turn_relevancy/turn_relevancy.py +12 -0
  97. deepeval/models/embedding_models/azure_embedding_model.py +37 -36
  98. deepeval/models/embedding_models/local_embedding_model.py +30 -32
  99. deepeval/models/embedding_models/ollama_embedding_model.py +18 -20
  100. deepeval/models/embedding_models/openai_embedding_model.py +22 -31
  101. deepeval/models/llms/grok_model.py +1 -1
  102. deepeval/models/llms/openai_model.py +2 -0
  103. deepeval/openai/__init__.py +14 -32
  104. deepeval/openai/extractors.py +85 -50
  105. deepeval/openai/patch.py +258 -167
  106. deepeval/openai/types.py +20 -0
  107. deepeval/openai/utils.py +205 -56
  108. deepeval/prompt/__init__.py +19 -1
  109. deepeval/prompt/api.py +160 -0
  110. deepeval/prompt/prompt.py +245 -62
  111. deepeval/prompt/utils.py +186 -15
  112. deepeval/synthesizer/chunking/context_generator.py +209 -152
  113. deepeval/synthesizer/chunking/doc_chunker.py +46 -12
  114. deepeval/synthesizer/synthesizer.py +19 -15
  115. deepeval/test_case/api.py +131 -0
  116. deepeval/test_case/llm_test_case.py +6 -2
  117. deepeval/test_run/__init__.py +1 -0
  118. deepeval/test_run/hyperparameters.py +47 -8
  119. deepeval/test_run/test_run.py +292 -206
  120. deepeval/tracing/__init__.py +2 -1
  121. deepeval/tracing/api.py +3 -1
  122. deepeval/tracing/otel/exporter.py +3 -4
  123. deepeval/tracing/otel/utils.py +24 -5
  124. deepeval/tracing/trace_context.py +89 -5
  125. deepeval/tracing/tracing.py +74 -3
  126. deepeval/tracing/types.py +20 -2
  127. deepeval/tracing/utils.py +8 -0
  128. deepeval/utils.py +21 -0
  129. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/METADATA +1 -1
  130. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/RECORD +133 -103
  131. deepeval/integrations/llama_index/agent/patched.py +0 -68
  132. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/LICENSE.md +0 -0
  133. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/WHEEL +0 -0
  134. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/entry_points.txt +0 -0
deepeval/metrics/bias/bias.py

@@ -1,6 +1,7 @@
 from typing import List, Optional, Type, Union

 from deepeval.metrics import BaseMetric
+from deepeval.metrics.api import metric_data_manager
 from deepeval.test_case import (
     LLMTestCase,
     LLMTestCaseParams,
@@ -48,8 +49,8 @@ class BiasMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
-
         check_llm_test_case_params(test_case, self._required_params, self)

         self.evaluation_cost = 0 if self.using_native_model else None
@@ -63,6 +64,7 @@ class BiasMetric(BaseMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -81,7 +83,10 @@ class BiasMetric(BaseMetric):
                         f"Score: {self.score}\nReason: {self.reason}",
                     ],
                 )
-
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score

     async def a_measure(
@@ -89,8 +94,8 @@ class BiasMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
-
         check_llm_test_case_params(test_case, self._required_params, self)

         self.evaluation_cost = 0 if self.using_native_model else None
@@ -116,6 +121,10 @@ class BiasMetric(BaseMetric):
                 ],
             )

+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score

     async def _a_generate_reason(self) -> str:
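
The same change repeats across the metric files that follow: measure() and a_measure() gain an internal _log_metric_to_confident flag (default True), and once a score is computed the metric is reported through metric_data_manager.post_metric_if_enabled(...) from the new deepeval/metrics/api.py. A minimal usage sketch, assuming a configured evaluation model; only the flag and the classes shown here come from this diff, the rest is ordinary deepeval usage:

    from deepeval.metrics import BiasMetric
    from deepeval.test_case import LLMTestCase

    test_case = LLMTestCase(
        input="What do you think about the candidate?",
        actual_output="The candidate is well qualified for the role.",
    )
    metric = BiasMetric(threshold=0.5)

    # Default path: after scoring, the result may also be posted via
    # metric_data_manager.post_metric_if_enabled(...) when logging is enabled.
    score = metric.measure(test_case)

    # Opting out, as the DAG verdict nodes later in this diff do for nested metrics:
    score = metric.measure(test_case, _log_metric_to_confident=False)
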
deepeval/metrics/contextual_precision/contextual_precision.py

@@ -17,7 +17,8 @@ from deepeval.metrics.contextual_precision.template import (
     ContextualPrecisionTemplate,
 )
 from deepeval.metrics.indicator import metric_progress_indicator
-from deepeval.metrics.contextual_precision.schema import *
+import deepeval.metrics.contextual_precision.schema as cpschema
+from deepeval.metrics.api import metric_data_manager


 class ContextualPrecisionMetric(BaseMetric):
@@ -53,8 +54,8 @@ class ContextualPrecisionMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
-
         check_llm_test_case_params(test_case, self._required_params, self)

         self.evaluation_cost = 0 if self.using_native_model else None
@@ -68,10 +69,11 @@ class ContextualPrecisionMetric(BaseMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
-                self.verdicts: List[ContextualPrecisionVerdict] = (
+                self.verdicts: List[cpschema.ContextualPrecisionVerdict] = (
                     self._generate_verdicts(
                         test_case.input,
                         test_case.expected_output,
@@ -88,7 +90,10 @@ class ContextualPrecisionMetric(BaseMetric):
                         f"Score: {self.score}\nReason: {self.reason}",
                     ],
                 )
-
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score

     async def a_measure(
@@ -96,6 +101,7 @@ class ContextualPrecisionMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:

         check_llm_test_case_params(test_case, self._required_params, self)
@@ -107,7 +113,7 @@ class ContextualPrecisionMetric(BaseMetric):
             _show_indicator=_show_indicator,
             _in_component=_in_component,
         ):
-            self.verdicts: List[ContextualPrecisionVerdict] = (
+            self.verdicts: List[cpschema.ContextualPrecisionVerdict] = (
                 await self._a_generate_verdicts(
                     test_case.input,
                     test_case.expected_output,
@@ -124,7 +130,10 @@ class ContextualPrecisionMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
-
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score

     async def _a_generate_reason(self, input: str):
@@ -132,7 +141,7 @@ class ContextualPrecisionMetric(BaseMetric):
             return None

         retrieval_contexts_verdicts = [
-            {"verdict": verdict.verdict, "reasons": verdict.reason}
+            {"verdict": verdict.verdict, "reason": verdict.reason}
             for verdict in self.verdicts
         ]
         prompt = self.evaluation_template.generate_reason(
@@ -143,15 +152,15 @@

         if self.using_native_model:
             res, cost = await self.model.a_generate(
-                prompt, schema=ContextualPrecisionScoreReason
+                prompt, schema=cpschema.ContextualPrecisionScoreReason
             )
             self.evaluation_cost += cost
             return res.reason
         else:
             try:
-                res: ContextualPrecisionScoreReason = (
+                res: cpschema.ContextualPrecisionScoreReason = (
                     await self.model.a_generate(
-                        prompt, schema=ContextualPrecisionScoreReason
+                        prompt, schema=cpschema.ContextualPrecisionScoreReason
                     )
                 )
                 return res.reason
@@ -165,7 +174,7 @@ class ContextualPrecisionMetric(BaseMetric):
             return None

         retrieval_contexts_verdicts = [
-            {"verdict": verdict.verdict, "reasons": verdict.reason}
+            {"verdict": verdict.verdict, "reason": verdict.reason}
             for verdict in self.verdicts
         ]
         prompt = self.evaluation_template.generate_reason(
@@ -176,14 +185,16 @@

         if self.using_native_model:
             res, cost = self.model.generate(
-                prompt, schema=ContextualPrecisionScoreReason
+                prompt, schema=cpschema.ContextualPrecisionScoreReason
             )
             self.evaluation_cost += cost
             return res.reason
         else:
             try:
-                res: ContextualPrecisionScoreReason = self.model.generate(
-                    prompt, schema=ContextualPrecisionScoreReason
+                res: cpschema.ContextualPrecisionScoreReason = (
+                    self.model.generate(
+                        prompt, schema=cpschema.ContextualPrecisionScoreReason
+                    )
                 )
                 return res.reason
             except TypeError:
@@ -193,21 +204,23 @@

     async def _a_generate_verdicts(
         self, input: str, expected_output: str, retrieval_context: List[str]
-    ) -> List[ContextualPrecisionVerdict]:
+    ) -> List[cpschema.ContextualPrecisionVerdict]:
         prompt = self.evaluation_template.generate_verdicts(
             input=input,
             expected_output=expected_output,
             retrieval_context=retrieval_context,
         )
         if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=Verdicts)
+            res, cost = await self.model.a_generate(
+                prompt, schema=cpschema.Verdicts
+            )
             self.evaluation_cost += cost
             verdicts = [item for item in res.verdicts]
             return verdicts
         else:
             try:
-                res: Verdicts = await self.model.a_generate(
-                    prompt, schema=Verdicts
+                res: cpschema.Verdicts = await self.model.a_generate(
+                    prompt, schema=cpschema.Verdicts
                 )
                 verdicts = [item for item in res.verdicts]
                 return verdicts
@@ -215,34 +228,36 @@ class ContextualPrecisionMetric(BaseMetric):
                 res = await self.model.a_generate(prompt)
                 data = trimAndLoadJson(res, self)
                 verdicts = [
-                    ContextualPrecisionVerdict(**item)
+                    cpschema.ContextualPrecisionVerdict(**item)
                     for item in data["verdicts"]
                 ]
                 return verdicts

     def _generate_verdicts(
         self, input: str, expected_output: str, retrieval_context: List[str]
-    ) -> List[ContextualPrecisionVerdict]:
+    ) -> List[cpschema.ContextualPrecisionVerdict]:
         prompt = self.evaluation_template.generate_verdicts(
             input=input,
             expected_output=expected_output,
             retrieval_context=retrieval_context,
         )
         if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=Verdicts)
+            res, cost = self.model.generate(prompt, schema=cpschema.Verdicts)
             self.evaluation_cost += cost
             verdicts = [item for item in res.verdicts]
             return verdicts
         else:
             try:
-                res: Verdicts = self.model.generate(prompt, schema=Verdicts)
+                res: cpschema.Verdicts = self.model.generate(
+                    prompt, schema=cpschema.Verdicts
+                )
                 verdicts = [item for item in res.verdicts]
                 return verdicts
             except TypeError:
                 res = self.model.generate(prompt)
                 data = trimAndLoadJson(res, self)
                 verdicts = [
-                    ContextualPrecisionVerdict(**item)
+                    cpschema.ContextualPrecisionVerdict(**item)
                     for item in data["verdicts"]
                 ]
                 return verdicts
@@ -279,7 +294,7 @@ class ContextualPrecisionMetric(BaseMetric):
         else:
             try:
                 self.success = self.score >= self.threshold
-            except:
+            except TypeError:
                 self.success = False
         return self.success

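
Beyond threading the _log_metric_to_confident flag through, this file also swaps the wildcard `from ...schema import *` for a module-qualified `import ... as cpschema`, fixes the "reasons" key to "reason", and narrows the bare `except:` to `except TypeError:`. The import change matters because several metric schema modules define identically named classes (such as Verdicts), which wildcard imports can silently shadow. A small sketch of the difference; the import below is the one introduced by this diff, while the shadowing scenario in the comment is illustrative:

    # With wildcard imports, a later `from another_metric.schema import *`
    # would rebind names like Verdicts to whichever module was imported last.
    # Module-qualified access keeps the origin explicit at every use site:
    import deepeval.metrics.contextual_precision.schema as cpschema

    verdict_cls = cpschema.ContextualPrecisionVerdict  # unambiguous reference
    verdicts_cls = cpschema.Verdicts
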
deepeval/metrics/contextual_recall/contextual_recall.py

@@ -16,6 +16,7 @@ from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.contextual_recall.template import ContextualRecallTemplate
 from deepeval.metrics.indicator import metric_progress_indicator
 from deepeval.metrics.contextual_recall.schema import *
+from deepeval.metrics.api import metric_data_manager


 class ContextualRecallMetric(BaseMetric):
@@ -52,8 +53,8 @@ class ContextualRecallMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
-
         check_llm_test_case_params(test_case, self._required_params, self)

         self.evaluation_cost = 0 if self.using_native_model else None
@@ -67,6 +68,7 @@ class ContextualRecallMetric(BaseMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -85,7 +87,10 @@ class ContextualRecallMetric(BaseMetric):
                         f"Score: {self.score}\nReason: {self.reason}",
                     ],
                 )
-
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score

     async def a_measure(
@@ -93,6 +98,7 @@ class ContextualRecallMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:

         check_llm_test_case_params(test_case, self._required_params, self)
@@ -121,7 +127,10 @@ class ContextualRecallMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
-
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score

     async def _a_generate_reason(self, expected_output: str):
deepeval/metrics/contextual_relevancy/contextual_relevancy.py

@@ -19,6 +19,7 @@ from deepeval.metrics.contextual_relevancy.template import (
 )
 from deepeval.metrics.indicator import metric_progress_indicator
 from deepeval.metrics.contextual_relevancy.schema import *
+from deepeval.metrics.api import metric_data_manager


 class ContextualRelevancyMetric(BaseMetric):
@@ -53,6 +54,7 @@ class ContextualRelevancyMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:

         check_llm_test_case_params(test_case, self._required_params, self)
@@ -68,6 +70,7 @@ class ContextualRelevancyMetric(BaseMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -85,6 +88,10 @@ class ContextualRelevancyMetric(BaseMetric):
                         f"Score: {self.score}\nReason: {self.reason}",
                     ],
                 )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )

             return self.score

@@ -93,6 +100,7 @@ class ContextualRelevancyMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:

         check_llm_test_case_params(test_case, self._required_params, self)
@@ -122,7 +130,10 @@ class ContextualRelevancyMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
-
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score

     async def _a_generate_reason(self, input: str):
deepeval/metrics/conversation_completeness/conversation_completeness.py

@@ -19,6 +19,7 @@ from deepeval.test_case import TurnParams
 from deepeval.test_case.conversational_test_case import Turn
 from deepeval.utils import get_or_create_event_loop, prettify_list
 from deepeval.metrics.conversation_completeness.schema import *
+from deepeval.metrics.api import metric_data_manager


 class ConversationCompletenessMetric(BaseConversationalMetric):
@@ -48,6 +49,7 @@ class ConversationCompletenessMetric(BaseConversationalMetric):
         test_case: ConversationalTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ):
         check_conversational_test_case_params(
             test_case, self._required_test_case_params, self
@@ -64,6 +66,7 @@ class ConversationCompletenessMetric(BaseConversationalMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -89,6 +92,10 @@ class ConversationCompletenessMetric(BaseConversationalMetric):
                         f"Score: {self.score}\nReason: {self.reason}",
                     ],
                 )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score

     async def a_measure(
@@ -96,6 +103,7 @@ class ConversationCompletenessMetric(BaseConversationalMetric):
         test_case: ConversationalTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_conversational_test_case_params(
             test_case, self._required_test_case_params, self
@@ -129,6 +137,10 @@ class ConversationCompletenessMetric(BaseConversationalMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score

     async def _a_generate_reason(self) -> str:
deepeval/metrics/conversational_dag/conversational_dag.py

@@ -18,6 +18,7 @@ from deepeval.metrics.dag.utils import (
     extract_required_params,
     copy_graph,
 )
+from deepeval.metrics.api import metric_data_manager


 class ConversationalDAGMetric(BaseConversationalMetric):
@@ -59,6 +60,7 @@ class ConversationalDAGMetric(BaseConversationalMetric):
         test_case: ConversationalTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_conversational_test_case_params(
             test_case,
@@ -77,6 +79,7 @@ class ConversationalDAGMetric(BaseConversationalMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -89,6 +92,10 @@ class ConversationalDAGMetric(BaseConversationalMetric):
                         f"Score: {self.score}\nReason: {self.reason}",
                     ],
                 )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score

     async def a_measure(
@@ -96,6 +103,7 @@ class ConversationalDAGMetric(BaseConversationalMetric):
         test_case: ConversationalTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_conversational_test_case_params(
             test_case,
@@ -119,6 +127,10 @@ class ConversationalDAGMetric(BaseConversationalMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score

     def is_successful(self) -> bool:
deepeval/metrics/conversational_dag/nodes.py

@@ -141,7 +141,9 @@ class ConversationalVerdictNode(ConversationalBaseNode):
             copied_convo_g_eval = ConversationalGEval(**convo_g_eval_args)

             copied_convo_g_eval.measure(
-                test_case=test_case, _show_indicator=False
+                test_case=test_case,
+                _show_indicator=False,
+                _log_metric_to_confident=False,
             )
             metric._verbose_steps.append(
                 construct_node_verbose_log(self, depth, copied_convo_g_eval)
@@ -157,7 +159,9 @@ class ConversationalVerdictNode(ConversationalBaseNode):
             copied_metric.verbose_mode = False

             copied_metric.measure(
-                test_case=test_case, _show_indicator=False
+                test_case=test_case,
+                _show_indicator=False,
+                _log_metric_to_confident=False,
             )
             metric._verbose_steps.append(
                 construct_node_verbose_log(self, depth, copied_metric)
@@ -213,7 +217,9 @@ class ConversationalVerdictNode(ConversationalBaseNode):
             copied_convo_g_eval = ConversationalGEval(**convo_g_eval_args)

             await copied_convo_g_eval.a_measure(
-                test_case=test_case, _show_indicator=False
+                test_case=test_case,
+                _show_indicator=False,
+                _log_metric_to_confident=False,
             )
             metric._verbose_steps.append(
                 construct_node_verbose_log(self, depth, copied_convo_g_eval)
@@ -229,7 +235,9 @@ class ConversationalVerdictNode(ConversationalBaseNode):
             copied_metric.verbose_mode = False

             await copied_metric.a_measure(
-                test_case=test_case, _show_indicator=False
+                test_case=test_case,
+                _show_indicator=False,
+                _log_metric_to_confident=False,
             )
             metric._verbose_steps.append(
                 construct_node_verbose_log(self, depth, copied_metric)
deepeval/metrics/conversational_g_eval/__init__.py

@@ -0,0 +1,3 @@
+from .template import ConversationalGEvalTemplate
+
+__all__ = ["ConversationalGEvalTemplate"]
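
The new conversational_g_eval/__init__.py re-exports the template class, so it can be imported from the package path rather than the module file. A trivial sketch of the now-supported import; customizing the template is outside the scope of this diff:

    from deepeval.metrics.conversational_g_eval import ConversationalGEvalTemplate

    # Sanity check that the re-export added in this diff resolves.
    print(ConversationalGEvalTemplate)
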