deepeval 3.6.6__py3-none-any.whl → 3.6.7__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. deepeval/_version.py +1 -1
  2. deepeval/benchmarks/equity_med_qa/equity_med_qa.py +1 -0
  3. deepeval/cli/main.py +42 -0
  4. deepeval/confident/api.py +1 -0
  5. deepeval/config/settings.py +22 -4
  6. deepeval/constants.py +8 -1
  7. deepeval/dataset/dataset.py +2 -11
  8. deepeval/dataset/utils.py +1 -1
  9. deepeval/evaluate/evaluate.py +5 -1
  10. deepeval/evaluate/execute.py +97 -42
  11. deepeval/evaluate/utils.py +20 -116
  12. deepeval/integrations/crewai/__init__.py +6 -1
  13. deepeval/integrations/crewai/handler.py +1 -1
  14. deepeval/integrations/crewai/subs.py +51 -0
  15. deepeval/integrations/crewai/wrapper.py +45 -5
  16. deepeval/metrics/answer_relevancy/answer_relevancy.py +12 -3
  17. deepeval/metrics/api.py +281 -0
  18. deepeval/metrics/argument_correctness/argument_correctness.py +12 -2
  19. deepeval/metrics/bias/bias.py +12 -3
  20. deepeval/metrics/contextual_precision/contextual_precision.py +12 -3
  21. deepeval/metrics/contextual_recall/contextual_recall.py +12 -3
  22. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +12 -1
  23. deepeval/metrics/conversation_completeness/conversation_completeness.py +12 -0
  24. deepeval/metrics/conversational_dag/conversational_dag.py +12 -0
  25. deepeval/metrics/conversational_dag/nodes.py +12 -4
  26. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +73 -59
  27. deepeval/metrics/dag/dag.py +12 -0
  28. deepeval/metrics/dag/nodes.py +12 -4
  29. deepeval/metrics/faithfulness/faithfulness.py +12 -1
  30. deepeval/metrics/g_eval/g_eval.py +11 -0
  31. deepeval/metrics/hallucination/hallucination.py +12 -1
  32. deepeval/metrics/indicator.py +8 -2
  33. deepeval/metrics/json_correctness/json_correctness.py +12 -1
  34. deepeval/metrics/knowledge_retention/knowledge_retention.py +12 -0
  35. deepeval/metrics/mcp/mcp_task_completion.py +13 -0
  36. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +13 -0
  37. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +12 -1
  38. deepeval/metrics/misuse/misuse.py +12 -1
  39. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +3 -0
  40. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +3 -0
  41. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +3 -0
  42. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +3 -0
  43. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +6 -1
  44. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +6 -1
  45. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +3 -0
  46. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +3 -0
  47. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +3 -0
  48. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +3 -0
  49. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +10 -5
  50. deepeval/metrics/non_advice/non_advice.py +12 -0
  51. deepeval/metrics/pii_leakage/pii_leakage.py +12 -1
  52. deepeval/metrics/prompt_alignment/prompt_alignment.py +12 -1
  53. deepeval/metrics/role_adherence/role_adherence.py +12 -0
  54. deepeval/metrics/role_violation/role_violation.py +12 -0
  55. deepeval/metrics/summarization/summarization.py +12 -1
  56. deepeval/metrics/task_completion/task_completion.py +3 -0
  57. deepeval/metrics/tool_correctness/tool_correctness.py +8 -0
  58. deepeval/metrics/toxicity/toxicity.py +12 -0
  59. deepeval/metrics/turn_relevancy/turn_relevancy.py +12 -0
  60. deepeval/models/llms/grok_model.py +1 -1
  61. deepeval/models/llms/openai_model.py +2 -0
  62. deepeval/openai/__init__.py +14 -32
  63. deepeval/openai/extractors.py +24 -34
  64. deepeval/openai/patch.py +256 -161
  65. deepeval/openai/types.py +20 -0
  66. deepeval/openai/utils.py +98 -56
  67. deepeval/prompt/__init__.py +19 -1
  68. deepeval/prompt/api.py +160 -0
  69. deepeval/prompt/prompt.py +244 -62
  70. deepeval/prompt/utils.py +144 -2
  71. deepeval/synthesizer/chunking/context_generator.py +209 -152
  72. deepeval/synthesizer/chunking/doc_chunker.py +46 -12
  73. deepeval/synthesizer/synthesizer.py +8 -5
  74. deepeval/test_case/api.py +131 -0
  75. deepeval/test_run/__init__.py +1 -0
  76. deepeval/test_run/hyperparameters.py +47 -8
  77. deepeval/test_run/test_run.py +104 -1
  78. deepeval/tracing/api.py +3 -1
  79. deepeval/tracing/message_types/__init__.py +10 -0
  80. deepeval/tracing/message_types/base.py +6 -0
  81. deepeval/tracing/message_types/messages.py +14 -0
  82. deepeval/tracing/message_types/tools.py +18 -0
  83. deepeval/tracing/otel/utils.py +1 -1
  84. deepeval/tracing/trace_context.py +73 -4
  85. deepeval/tracing/tracing.py +51 -3
  86. deepeval/tracing/types.py +16 -0
  87. deepeval/tracing/utils.py +8 -0
  88. {deepeval-3.6.6.dist-info → deepeval-3.6.7.dist-info}/METADATA +1 -1
  89. {deepeval-3.6.6.dist-info → deepeval-3.6.7.dist-info}/RECORD +92 -84
  90. {deepeval-3.6.6.dist-info → deepeval-3.6.7.dist-info}/LICENSE.md +0 -0
  91. {deepeval-3.6.6.dist-info → deepeval-3.6.7.dist-info}/WHEEL +0 -0
  92. {deepeval-3.6.6.dist-info → deepeval-3.6.7.dist-info}/entry_points.txt +0 -0
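
Note: the new deepeval/metrics/api.py module (+281 lines) is not reproduced in the hunks below, but every metric hunk imports and calls its module-level metric_data_manager. The sketch below is a hypothetical stand-in for that call surface only, not the actual implementation; the class name, the enabled flag, and the payload fields are assumptions made for illustration.

class _MetricDataManager:
    """Illustrative stand-in only; the real gating and upload logic live in deepeval."""

    def __init__(self) -> None:
        self.enabled = False  # assumption: reporting is opt-in (e.g. via an API key)

    def post_metric_if_enabled(self, metric, test_case=None) -> None:
        # No-op unless reporting is enabled; a real implementation would
        # serialize the metric result and send it to Confident AI.
        if not self.enabled:
            return
        print(
            "would post:",
            {
                "metric": type(metric).__name__,
                "score": getattr(metric, "score", None),
                "reason": getattr(metric, "reason", None),
            },
        )

metric_data_manager = _MetricDataManager()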

deepeval/metrics/contextual_recall/contextual_recall.py
@@ -16,6 +16,7 @@ from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.contextual_recall.template import ContextualRecallTemplate
 from deepeval.metrics.indicator import metric_progress_indicator
 from deepeval.metrics.contextual_recall.schema import *
+from deepeval.metrics.api import metric_data_manager


 class ContextualRecallMetric(BaseMetric):
@@ -52,8 +53,8 @@ class ContextualRecallMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
-
         check_llm_test_case_params(test_case, self._required_params, self)

         self.evaluation_cost = 0 if self.using_native_model else None
@@ -67,6 +68,7 @@ class ContextualRecallMetric(BaseMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -85,7 +87,10 @@ class ContextualRecallMetric(BaseMetric):
                         f"Score: {self.score}\nReason: {self.reason}",
                     ],
                 )
-
+                if _log_metric_to_confident:
+                    metric_data_manager.post_metric_if_enabled(
+                        self, test_case=test_case
+                    )
             return self.score

     async def a_measure(
@@ -93,6 +98,7 @@ class ContextualRecallMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:

         check_llm_test_case_params(test_case, self._required_params, self)
@@ -121,7 +127,10 @@ class ContextualRecallMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
-
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score

     async def _a_generate_reason(self, expected_output: str):
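
Taken together, the hunks above add an internal _log_metric_to_confident keyword to measure() and a_measure(); when it is left at its default of True, the finished metric is forwarded to Confident AI through metric_data_manager.post_metric_if_enabled. The same pattern repeats in the remaining metric files below, while the DAG node hunks pass _log_metric_to_confident=False so that child metrics evaluated inside a DAG do not report separately from their parent. A minimal usage sketch, assuming the existing public ContextualRecallMetric and LLMTestCase APIs (the test-case values are invented for illustration):

from deepeval.metrics import ContextualRecallMetric
from deepeval.test_case import LLMTestCase

metric = ContextualRecallMetric()
test_case = LLMTestCase(
    input="What is the capital of France?",
    actual_output="Paris.",
    expected_output="Paris is the capital of France.",
    retrieval_context=["Paris is the capital and largest city of France."],
)

# With the default (True), the scored metric is also posted via
# metric_data_manager.post_metric_if_enabled(...); False skips that step.
score = metric.measure(test_case, _log_metric_to_confident=False)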

deepeval/metrics/contextual_relevancy/contextual_relevancy.py
@@ -19,6 +19,7 @@ from deepeval.metrics.contextual_relevancy.template import (
 )
 from deepeval.metrics.indicator import metric_progress_indicator
 from deepeval.metrics.contextual_relevancy.schema import *
+from deepeval.metrics.api import metric_data_manager


 class ContextualRelevancyMetric(BaseMetric):
@@ -53,6 +54,7 @@ class ContextualRelevancyMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:

         check_llm_test_case_params(test_case, self._required_params, self)
@@ -68,6 +70,7 @@ class ContextualRelevancyMetric(BaseMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -85,6 +88,10 @@ class ContextualRelevancyMetric(BaseMetric):
                         f"Score: {self.score}\nReason: {self.reason}",
                     ],
                 )
+                if _log_metric_to_confident:
+                    metric_data_manager.post_metric_if_enabled(
+                        self, test_case=test_case
+                    )

             return self.score

@@ -93,6 +100,7 @@ class ContextualRelevancyMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:

         check_llm_test_case_params(test_case, self._required_params, self)
@@ -122,7 +130,10 @@ class ContextualRelevancyMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
-
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score

     async def _a_generate_reason(self, input: str):

deepeval/metrics/conversation_completeness/conversation_completeness.py
@@ -19,6 +19,7 @@ from deepeval.test_case import TurnParams
 from deepeval.test_case.conversational_test_case import Turn
 from deepeval.utils import get_or_create_event_loop, prettify_list
 from deepeval.metrics.conversation_completeness.schema import *
+from deepeval.metrics.api import metric_data_manager


 class ConversationCompletenessMetric(BaseConversationalMetric):
@@ -48,6 +49,7 @@ class ConversationCompletenessMetric(BaseConversationalMetric):
         test_case: ConversationalTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ):
         check_conversational_test_case_params(
             test_case, self._required_test_case_params, self
@@ -64,6 +66,7 @@ class ConversationCompletenessMetric(BaseConversationalMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -89,6 +92,10 @@ class ConversationCompletenessMetric(BaseConversationalMetric):
                         f"Score: {self.score}\nReason: {self.reason}",
                     ],
                 )
+                if _log_metric_to_confident:
+                    metric_data_manager.post_metric_if_enabled(
+                        self, test_case=test_case
+                    )
             return self.score

     async def a_measure(
@@ -96,6 +103,7 @@ class ConversationCompletenessMetric(BaseConversationalMetric):
         test_case: ConversationalTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_conversational_test_case_params(
             test_case, self._required_test_case_params, self
@@ -129,6 +137,10 @@ class ConversationCompletenessMetric(BaseConversationalMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score

     async def _a_generate_reason(self) -> str:

deepeval/metrics/conversational_dag/conversational_dag.py
@@ -18,6 +18,7 @@ from deepeval.metrics.dag.utils import (
     extract_required_params,
     copy_graph,
 )
+from deepeval.metrics.api import metric_data_manager


 class ConversationalDAGMetric(BaseConversationalMetric):
@@ -59,6 +60,7 @@ class ConversationalDAGMetric(BaseConversationalMetric):
         test_case: ConversationalTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_conversational_test_case_params(
             test_case,
@@ -77,6 +79,7 @@ class ConversationalDAGMetric(BaseConversationalMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -89,6 +92,10 @@ class ConversationalDAGMetric(BaseConversationalMetric):
                         f"Score: {self.score}\nReason: {self.reason}",
                     ],
                 )
+                if _log_metric_to_confident:
+                    metric_data_manager.post_metric_if_enabled(
+                        self, test_case=test_case
+                    )
             return self.score

     async def a_measure(
@@ -96,6 +103,7 @@ class ConversationalDAGMetric(BaseConversationalMetric):
         test_case: ConversationalTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_conversational_test_case_params(
             test_case,
@@ -119,6 +127,10 @@ class ConversationalDAGMetric(BaseConversationalMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score

     def is_successful(self) -> bool:

deepeval/metrics/conversational_dag/nodes.py
@@ -141,7 +141,9 @@ class ConversationalVerdictNode(ConversationalBaseNode):
             copied_convo_g_eval = ConversationalGEval(**convo_g_eval_args)

             copied_convo_g_eval.measure(
-                test_case=test_case, _show_indicator=False
+                test_case=test_case,
+                _show_indicator=False,
+                _log_metric_to_confident=False,
             )
             metric._verbose_steps.append(
                 construct_node_verbose_log(self, depth, copied_convo_g_eval)
@@ -157,7 +159,9 @@ class ConversationalVerdictNode(ConversationalBaseNode):
             copied_metric.verbose_mode = False

             copied_metric.measure(
-                test_case=test_case, _show_indicator=False
+                test_case=test_case,
+                _show_indicator=False,
+                _log_metric_to_confident=False,
             )
             metric._verbose_steps.append(
                 construct_node_verbose_log(self, depth, copied_metric)
@@ -213,7 +217,9 @@ class ConversationalVerdictNode(ConversationalBaseNode):
             copied_convo_g_eval = ConversationalGEval(**convo_g_eval_args)

             await copied_convo_g_eval.a_measure(
-                test_case=test_case, _show_indicator=False
+                test_case=test_case,
+                _show_indicator=False,
+                _log_metric_to_confident=False,
             )
             metric._verbose_steps.append(
                 construct_node_verbose_log(self, depth, copied_convo_g_eval)
@@ -229,7 +235,9 @@ class ConversationalVerdictNode(ConversationalBaseNode):
             copied_metric.verbose_mode = False

             await copied_metric.a_measure(
-                test_case=test_case, _show_indicator=False
+                test_case=test_case,
+                _show_indicator=False,
+                _log_metric_to_confident=False,
             )
             metric._verbose_steps.append(
                 construct_node_verbose_log(self, depth, copied_metric)

deepeval/metrics/conversational_g_eval/conversational_g_eval.py
@@ -11,7 +11,6 @@ from deepeval.metrics.g_eval.utils import (
     format_rubrics,
 )
 from deepeval.test_case import (
-    Turn,
     TurnParams,
     ConversationalTestCase,
 )
@@ -28,7 +27,8 @@ from deepeval.metrics.utils import (
 )
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.indicator import metric_progress_indicator
-from deepeval.metrics.conversational_g_eval.schema import *
+import deepeval.metrics.conversational_g_eval.schema as cgschema
+from deepeval.metrics.api import metric_data_manager


 class ConversationalGEval(BaseConversationalMetric):
@@ -92,6 +92,7 @@ class ConversationalGEval(BaseConversationalMetric):
         test_case: ConversationalTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_conversational_test_case_params(
             test_case, self.evaluation_params, self
@@ -108,6 +109,7 @@ class ConversationalGEval(BaseConversationalMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -132,6 +134,10 @@ class ConversationalGEval(BaseConversationalMetric):
                         f"Score: {self.score}\nReason: {self.reason}",
                     ],
                 )
+                if _log_metric_to_confident:
+                    metric_data_manager.post_metric_if_enabled(
+                        self, test_case=test_case
+                    )

             return self.score

@@ -140,6 +146,7 @@ class ConversationalGEval(BaseConversationalMetric):
         test_case: ConversationalTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_conversational_test_case_params(
             test_case, self.evaluation_params, self
@@ -173,6 +180,10 @@ class ConversationalGEval(BaseConversationalMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )

             return self.score

@@ -187,12 +198,16 @@ class ConversationalGEval(BaseConversationalMetric):
             criteria=self.criteria, parameters=g_eval_params_str
         )
         if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=Steps)
+            res, cost = await self.model.a_generate(
+                prompt, schema=cgschema.Steps
+            )
             self.evaluation_cost += cost
             return res.steps
         else:
             try:
-                res: Steps = await self.model.a_generate(prompt, schema=Steps)
+                res: cgschema.Steps = await self.model.a_generate(
+                    prompt, schema=cgschema.Steps
+                )
                 return res.steps
             except TypeError:
                 res = await self.model.a_generate(prompt)
@@ -210,12 +225,14 @@ class ConversationalGEval(BaseConversationalMetric):
             criteria=self.criteria, parameters=g_eval_params_str
         )
         if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=Steps)
+            res, cost = self.model.generate(prompt, schema=cgschema.Steps)
             self.evaluation_cost += cost
             return res.steps
         else:
             try:
-                res: Steps = self.model.generate(prompt, schema=Steps)
+                res: cgschema.Steps = self.model.generate(
+                    prompt, schema=cgschema.Steps
+                )
                 return res.steps
             except TypeError:
                 res = self.model.generate(prompt)
@@ -270,21 +287,21 @@ class ConversationalGEval(BaseConversationalMetric):
                     score, res
                 )
                 return weighted_summed_score, reason
-            except:
+            except (KeyError, AttributeError, TypeError, ValueError):
                 return score, reason
        except (
            AttributeError
        ):  # This catches the case where a_generate_raw_response doesn't exist.
            if self.using_native_model:
                res, cost = await self.model.a_generate(
-                    prompt, schema=ReasonScore
+                    prompt, schema=cgschema.ReasonScore
                )
                self.evaluation_cost += cost
                return res.score, res.reason
            else:
                try:
-                    res: ReasonScore = await self.model.a_generate(
-                        prompt, schema=ReasonScore
+                    res: cgschema.ReasonScore = await self.model.a_generate(
+                        prompt, schema=cgschema.ReasonScore
                    )
                    return res.score, res.reason
                except TypeError:
@@ -340,18 +357,20 @@ class ConversationalGEval(BaseConversationalMetric):
                    score, res
                )
                return weighted_summed_score, reason
-            except:
+            except (KeyError, AttributeError, TypeError, ValueError):
                return score, reason
        except AttributeError:
            # This catches the case where a_generate_raw_response doesn't exist.
            if self.using_native_model:
-                res, cost = self.model.generate(prompt, schema=ReasonScore)
+                res, cost = self.model.generate(
+                    prompt, schema=cgschema.ReasonScore
+                )
                self.evaluation_cost += cost
                return res.score, res.reason
            else:
                try:
-                    res: ReasonScore = self.model.generate(
-                        prompt, schema=ReasonScore
+                    res: cgschema.ReasonScore = self.model.generate(
+                        prompt, schema=cgschema.ReasonScore
                    )
                    return res.score, res.reason
                except TypeError:
@@ -362,49 +381,44 @@ class ConversationalGEval(BaseConversationalMetric):
     def generate_weighted_summed_score(
         self, raw_score: int, raw_response: ChatCompletion
     ) -> Union[int, float]:
-        try:
-            generated_logprobs = raw_response.choices[0].logprobs.content
-            # First, locate the token that we care for logprobs, i.e., the token matching the score
-            score_logprobs = None
-            for token_logprobs in generated_logprobs:
-                if token_logprobs.token == str(raw_score):
-                    score_logprobs = token_logprobs
-                    break
-            # Then, calculate the score based on the logprobs
-            token_linear_probability: Dict[int, float] = {}
-            sum_linear_probability = 0
-            # Filter out tokens with <1% linear probability, i.e., logprobs < math.log(0.01)
-            min_logprob = math.log(0.01)
-            for token_logprob in score_logprobs.top_logprobs:
-                logprob = token_logprob.logprob
-
-                # Filter out low probability tokens
-                if logprob < min_logprob:
-                    continue
-                # Filter out non-decimal token to prevent errors in later int(token) conversion
-                if not token_logprob.token.isdecimal():
-                    continue
-
-                # Calculate the linear probability
-                linear_prob = math.exp(logprob)
-                token_score = int(token_logprob.token)
-                if token_linear_probability.get(token_score):
-                    token_linear_probability[token_score] += linear_prob
-                else:
-                    token_linear_probability[token_score] = linear_prob
-                sum_linear_probability += linear_prob
-
-            sum_of_weighted_scores = 0.0
-            for score, prob in token_linear_probability.items():
-                sum_of_weighted_scores += score * prob
-
-            # Scale the sum of linear probability to 1
-            weighted_summed_score = (
-                sum_of_weighted_scores / sum_linear_probability
-            )
-            return weighted_summed_score
-        except:
-            raise
+        generated_logprobs = raw_response.choices[0].logprobs.content
+        # First, locate the token that we care for logprobs, i.e., the token matching the score
+        score_logprobs = None
+        for token_logprobs in generated_logprobs:
+            if token_logprobs.token == str(raw_score):
+                score_logprobs = token_logprobs
+                break
+        # Then, calculate the score based on the logprobs
+        token_linear_probability: Dict[int, float] = {}
+        sum_linear_probability = 0
+        # Filter out tokens with <1% linear probability, i.e., logprobs < math.log(0.01)
+        min_logprob = math.log(0.01)
+        for token_logprob in score_logprobs.top_logprobs:
+            logprob = token_logprob.logprob
+
+            # Filter out low probability tokens
+            if logprob < min_logprob:
+                continue
+            # Filter out non-decimal token to prevent errors in later int(token) conversion
+            if not token_logprob.token.isdecimal():
+                continue
+
+            # Calculate the linear probability
+            linear_prob = math.exp(logprob)
+            token_score = int(token_logprob.token)
+            if token_linear_probability.get(token_score):
+                token_linear_probability[token_score] += linear_prob
+            else:
+                token_linear_probability[token_score] = linear_prob
+            sum_linear_probability += linear_prob
+
+        sum_of_weighted_scores = 0.0
+        for score, prob in token_linear_probability.items():
+            sum_of_weighted_scores += score * prob
+
+        # Scale the sum of linear probability to 1
+        weighted_summed_score = sum_of_weighted_scores / sum_linear_probability
+        return weighted_summed_score

     def number_evaluation_steps(self):
         evaluation_steps = """"""
@@ -417,8 +431,8 @@ class ConversationalGEval(BaseConversationalMetric):
             self.success = False
         else:
             try:
-                self.score >= self.threshold
-            except:
+                self.success = self.score >= self.threshold
+            except TypeError:
                 self.success = False
         return self.success


deepeval/metrics/dag/dag.py
@@ -18,6 +18,7 @@ from deepeval.metrics.dag.utils import (
     is_valid_dag_from_roots,
     extract_required_params,
 )
+from deepeval.metrics.api import metric_data_manager


 class DAGMetric(BaseMetric):
@@ -59,6 +60,7 @@ class DAGMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_llm_test_case_params(
             test_case,
@@ -77,6 +79,7 @@ class DAGMetric(BaseMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -89,6 +92,10 @@ class DAGMetric(BaseMetric):
                         f"Score: {self.score}\nReason: {self.reason}",
                     ],
                 )
+                if _log_metric_to_confident:
+                    metric_data_manager.post_metric_if_enabled(
+                        self, test_case=test_case
+                    )
             return self.score

     async def a_measure(
@@ -96,6 +103,7 @@ class DAGMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_llm_test_case_params(
             test_case,
@@ -119,6 +127,10 @@ class DAGMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score

     def is_successful(self) -> bool:

deepeval/metrics/dag/nodes.py
@@ -111,7 +111,9 @@ class VerdictNode(BaseNode):
             copied_g_eval = GEval(**g_eval_args)

             copied_g_eval.measure(
-                test_case=test_case, _show_indicator=False
+                test_case=test_case,
+                _show_indicator=False,
+                _log_metric_to_confident=False,
             )
             metric._verbose_steps.append(
                 construct_node_verbose_log(self, depth, copied_g_eval)
@@ -124,7 +126,9 @@ class VerdictNode(BaseNode):
             copied_metric.verbose_mode = False

             copied_metric.measure(
-                test_case=test_case, _show_indicator=False
+                test_case=test_case,
+                _show_indicator=False,
+                _log_metric_to_confident=False,
             )
             metric._verbose_steps.append(
                 construct_node_verbose_log(self, depth, copied_metric)
@@ -174,7 +178,9 @@ class VerdictNode(BaseNode):
             copied_g_eval = GEval(**g_eval_args)

             await copied_g_eval.a_measure(
-                test_case=test_case, _show_indicator=False
+                test_case=test_case,
+                _show_indicator=False,
+                _log_metric_to_confident=False,
             )
             metric._verbose_steps.append(
                 construct_node_verbose_log(self, depth, copied_g_eval)
@@ -188,7 +194,9 @@ class VerdictNode(BaseNode):
             copied_metric.verbose_mode = False

             await copied_metric.a_measure(
-                test_case=test_case, _show_indicator=False
+                test_case=test_case,
+                _show_indicator=False,
+                _log_metric_to_confident=False,
             )
             metric._verbose_steps.append(
                 construct_node_verbose_log(self, depth, copied_metric)

deepeval/metrics/faithfulness/faithfulness.py
@@ -23,6 +23,7 @@ from deepeval.metrics.faithfulness.schema import (
     Truths,
     Claims,
 )
+from deepeval.metrics.api import metric_data_manager


 class FaithfulnessMetric(BaseMetric):
@@ -63,6 +64,7 @@ class FaithfulnessMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:

         check_llm_test_case_params(test_case, self._required_params, self)
@@ -78,6 +80,7 @@ class FaithfulnessMetric(BaseMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -96,6 +99,10 @@ class FaithfulnessMetric(BaseMetric):
                         f"Score: {self.score}\nReason: {self.reason}",
                     ],
                 )
+                if _log_metric_to_confident:
+                    metric_data_manager.post_metric_if_enabled(
+                        self, test_case=test_case
+                    )

             return self.score

@@ -104,6 +111,7 @@ class FaithfulnessMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:

         check_llm_test_case_params(test_case, self._required_params, self)
@@ -132,7 +140,10 @@ class FaithfulnessMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
-
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score

     async def _a_generate_reason(self) -> str: