deepeval 3.6.6__py3-none-any.whl → 3.6.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. deepeval/_version.py +1 -1
  2. deepeval/benchmarks/equity_med_qa/equity_med_qa.py +1 -0
  3. deepeval/cli/main.py +42 -0
  4. deepeval/confident/api.py +1 -0
  5. deepeval/config/settings.py +22 -4
  6. deepeval/constants.py +8 -1
  7. deepeval/dataset/dataset.py +2 -11
  8. deepeval/dataset/utils.py +1 -1
  9. deepeval/errors.py +20 -2
  10. deepeval/evaluate/evaluate.py +5 -1
  11. deepeval/evaluate/execute.py +811 -248
  12. deepeval/evaluate/types.py +1 -0
  13. deepeval/evaluate/utils.py +33 -119
  14. deepeval/integrations/crewai/__init__.py +7 -1
  15. deepeval/integrations/crewai/handler.py +1 -1
  16. deepeval/integrations/crewai/subs.py +51 -0
  17. deepeval/integrations/crewai/tool.py +71 -0
  18. deepeval/integrations/crewai/wrapper.py +45 -5
  19. deepeval/integrations/llama_index/__init__.py +0 -4
  20. deepeval/integrations/llama_index/handler.py +20 -21
  21. deepeval/integrations/pydantic_ai/instrumentator.py +125 -76
  22. deepeval/metrics/__init__.py +13 -0
  23. deepeval/metrics/answer_relevancy/answer_relevancy.py +12 -3
  24. deepeval/metrics/api.py +281 -0
  25. deepeval/metrics/argument_correctness/argument_correctness.py +12 -2
  26. deepeval/metrics/base_metric.py +1 -0
  27. deepeval/metrics/bias/bias.py +12 -3
  28. deepeval/metrics/contextual_precision/contextual_precision.py +39 -24
  29. deepeval/metrics/contextual_recall/contextual_recall.py +12 -3
  30. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +12 -1
  31. deepeval/metrics/conversation_completeness/conversation_completeness.py +12 -0
  32. deepeval/metrics/conversational_dag/conversational_dag.py +12 -0
  33. deepeval/metrics/conversational_dag/nodes.py +12 -4
  34. deepeval/metrics/conversational_g_eval/__init__.py +3 -0
  35. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +84 -66
  36. deepeval/metrics/dag/dag.py +12 -0
  37. deepeval/metrics/dag/nodes.py +12 -4
  38. deepeval/metrics/dag/schema.py +1 -1
  39. deepeval/metrics/dag/templates.py +2 -2
  40. deepeval/metrics/faithfulness/faithfulness.py +12 -1
  41. deepeval/metrics/g_eval/g_eval.py +11 -0
  42. deepeval/metrics/goal_accuracy/__init__.py +1 -0
  43. deepeval/metrics/goal_accuracy/goal_accuracy.py +349 -0
  44. deepeval/metrics/goal_accuracy/schema.py +17 -0
  45. deepeval/metrics/goal_accuracy/template.py +235 -0
  46. deepeval/metrics/hallucination/hallucination.py +20 -9
  47. deepeval/metrics/indicator.py +8 -2
  48. deepeval/metrics/json_correctness/json_correctness.py +12 -1
  49. deepeval/metrics/knowledge_retention/knowledge_retention.py +12 -0
  50. deepeval/metrics/mcp/mcp_task_completion.py +20 -2
  51. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +29 -6
  52. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +14 -2
  53. deepeval/metrics/misuse/misuse.py +12 -1
  54. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +3 -0
  55. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +3 -0
  56. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +3 -0
  57. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +3 -0
  58. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +6 -1
  59. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +38 -25
  60. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +3 -0
  61. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +3 -0
  62. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +3 -0
  63. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +3 -0
  64. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +10 -5
  65. deepeval/metrics/non_advice/non_advice.py +12 -0
  66. deepeval/metrics/pii_leakage/pii_leakage.py +12 -1
  67. deepeval/metrics/plan_adherence/__init__.py +1 -0
  68. deepeval/metrics/plan_adherence/plan_adherence.py +292 -0
  69. deepeval/metrics/plan_adherence/schema.py +11 -0
  70. deepeval/metrics/plan_adherence/template.py +170 -0
  71. deepeval/metrics/plan_quality/__init__.py +1 -0
  72. deepeval/metrics/plan_quality/plan_quality.py +292 -0
  73. deepeval/metrics/plan_quality/schema.py +11 -0
  74. deepeval/metrics/plan_quality/template.py +101 -0
  75. deepeval/metrics/prompt_alignment/prompt_alignment.py +12 -1
  76. deepeval/metrics/role_adherence/role_adherence.py +12 -0
  77. deepeval/metrics/role_violation/role_violation.py +12 -0
  78. deepeval/metrics/step_efficiency/__init__.py +1 -0
  79. deepeval/metrics/step_efficiency/schema.py +11 -0
  80. deepeval/metrics/step_efficiency/step_efficiency.py +234 -0
  81. deepeval/metrics/step_efficiency/template.py +256 -0
  82. deepeval/metrics/summarization/summarization.py +12 -1
  83. deepeval/metrics/task_completion/task_completion.py +4 -0
  84. deepeval/metrics/tool_correctness/schema.py +6 -0
  85. deepeval/metrics/tool_correctness/template.py +88 -0
  86. deepeval/metrics/tool_correctness/tool_correctness.py +233 -21
  87. deepeval/metrics/tool_use/__init__.py +1 -0
  88. deepeval/metrics/tool_use/schema.py +19 -0
  89. deepeval/metrics/tool_use/template.py +220 -0
  90. deepeval/metrics/tool_use/tool_use.py +458 -0
  91. deepeval/metrics/topic_adherence/__init__.py +1 -0
  92. deepeval/metrics/topic_adherence/schema.py +16 -0
  93. deepeval/metrics/topic_adherence/template.py +162 -0
  94. deepeval/metrics/topic_adherence/topic_adherence.py +355 -0
  95. deepeval/metrics/toxicity/toxicity.py +12 -0
  96. deepeval/metrics/turn_relevancy/turn_relevancy.py +12 -0
  97. deepeval/models/embedding_models/azure_embedding_model.py +37 -36
  98. deepeval/models/embedding_models/local_embedding_model.py +30 -32
  99. deepeval/models/embedding_models/ollama_embedding_model.py +18 -20
  100. deepeval/models/embedding_models/openai_embedding_model.py +22 -31
  101. deepeval/models/llms/grok_model.py +1 -1
  102. deepeval/models/llms/openai_model.py +2 -0
  103. deepeval/openai/__init__.py +14 -32
  104. deepeval/openai/extractors.py +85 -50
  105. deepeval/openai/patch.py +258 -167
  106. deepeval/openai/types.py +20 -0
  107. deepeval/openai/utils.py +205 -56
  108. deepeval/prompt/__init__.py +19 -1
  109. deepeval/prompt/api.py +160 -0
  110. deepeval/prompt/prompt.py +245 -62
  111. deepeval/prompt/utils.py +186 -15
  112. deepeval/synthesizer/chunking/context_generator.py +209 -152
  113. deepeval/synthesizer/chunking/doc_chunker.py +46 -12
  114. deepeval/synthesizer/synthesizer.py +19 -15
  115. deepeval/test_case/api.py +131 -0
  116. deepeval/test_case/llm_test_case.py +6 -2
  117. deepeval/test_run/__init__.py +1 -0
  118. deepeval/test_run/hyperparameters.py +47 -8
  119. deepeval/test_run/test_run.py +292 -206
  120. deepeval/tracing/__init__.py +2 -1
  121. deepeval/tracing/api.py +3 -1
  122. deepeval/tracing/otel/exporter.py +3 -4
  123. deepeval/tracing/otel/utils.py +24 -5
  124. deepeval/tracing/trace_context.py +89 -5
  125. deepeval/tracing/tracing.py +74 -3
  126. deepeval/tracing/types.py +20 -2
  127. deepeval/tracing/utils.py +8 -0
  128. deepeval/utils.py +21 -0
  129. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/METADATA +1 -1
  130. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/RECORD +133 -103
  131. deepeval/integrations/llama_index/agent/patched.py +0 -68
  132. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/LICENSE.md +0 -0
  133. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/WHEEL +0 -0
  134. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/entry_points.txt +0 -0

deepeval/metrics/conversational_g_eval/conversational_g_eval.py

@@ -1,7 +1,7 @@
  """A slightly modified tailored version of the LLM evaluated metric based on the GEval framework: https://arxiv.org/pdf/2303.16634.pdf"""

  from openai.types.chat.chat_completion import ChatCompletion
- from typing import Optional, List, Tuple, Union, Dict
+ from typing import Optional, List, Tuple, Union, Dict, Type
  import math
  from deepeval.metrics import BaseConversationalMetric
  from deepeval.metrics.g_eval.utils import (
@@ -11,7 +11,6 @@ from deepeval.metrics.g_eval.utils import (
  format_rubrics,
  )
  from deepeval.test_case import (
- Turn,
  TurnParams,
  ConversationalTestCase,
  )
@@ -28,7 +27,8 @@ from deepeval.metrics.utils import (
  )
  from deepeval.models import DeepEvalBaseLLM
  from deepeval.metrics.indicator import metric_progress_indicator
- from deepeval.metrics.conversational_g_eval.schema import *
+ import deepeval.metrics.conversational_g_eval.schema as cgschema
+ from deepeval.metrics.api import metric_data_manager


  class ConversationalGEval(BaseConversationalMetric):
@@ -44,6 +44,9 @@ class ConversationalGEval(BaseConversationalMetric):
  async_mode: bool = True,
  strict_mode: bool = False,
  verbose_mode: bool = False,
+ evaluation_template: Type[
+ ConversationalGEvalTemplate
+ ] = ConversationalGEvalTemplate,
  _include_g_eval_suffix: bool = True,
  ):
  if evaluation_params is not None and len(evaluation_params) == 0:
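
ConversationalGEval now accepts an `evaluation_template` class (defaulting to `ConversationalGEvalTemplate`), so the prompts used for step generation and scoring can be swapped out. A minimal sketch of how this hook might be used; the template's import path and the use of `@staticmethod` are assumptions, since only the call sites appear in this diff:

```python
# Hypothetical usage of the new `evaluation_template` hook in 3.6.8.
# The import path of ConversationalGEvalTemplate is assumed and may differ.
from deepeval.metrics import ConversationalGEval
from deepeval.metrics.conversational_g_eval.template import ConversationalGEvalTemplate
from deepeval.test_case import TurnParams


class TersePromptTemplate(ConversationalGEvalTemplate):
    # The diff calls the template on the class object itself, so the overridden
    # members are assumed to be static (or class) methods.
    @staticmethod
    def generate_evaluation_steps(criteria: str, parameters: str) -> str:
        return (
            f"Produce at most three short evaluation steps for: {criteria}\n"
            f"Only consider these turn parameters: {parameters}"
        )


metric = ConversationalGEval(
    name="Helpfulness",
    criteria="Judge how helpful the assistant's replies are.",
    evaluation_params=[TurnParams.CONTENT],
    evaluation_template=TersePromptTemplate,  # new in 3.6.8
)
```
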
@@ -85,6 +88,7 @@ class ConversationalGEval(BaseConversationalMetric):
  self.strict_mode = strict_mode
  self.async_mode = async_mode
  self.verbose_mode = verbose_mode
+ self.evaluation_template = evaluation_template
  self._include_g_eval_suffix = _include_g_eval_suffix

  def measure(
@@ -92,6 +96,7 @@ class ConversationalGEval(BaseConversationalMetric):
  test_case: ConversationalTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  ) -> float:
  check_conversational_test_case_params(
  test_case, self.evaluation_params, self
@@ -108,6 +113,7 @@ class ConversationalGEval(BaseConversationalMetric):
  test_case,
  _show_indicator=False,
  _in_component=_in_component,
+ _log_metric_to_confident=_log_metric_to_confident,
  )
  )
  else:
@@ -132,6 +138,10 @@ class ConversationalGEval(BaseConversationalMetric):
  f"Score: {self.score}\nReason: {self.reason}",
  ],
  )
+ if _log_metric_to_confident:
+ metric_data_manager.post_metric_if_enabled(
+ self, test_case=test_case
+ )

  return self.score

@@ -140,6 +150,7 @@ class ConversationalGEval(BaseConversationalMetric):
  test_case: ConversationalTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  ) -> float:
  check_conversational_test_case_params(
  test_case, self.evaluation_params, self
@@ -173,6 +184,10 @@ class ConversationalGEval(BaseConversationalMetric):
  f"Score: {self.score}\nReason: {self.reason}",
  ],
  )
+ if _log_metric_to_confident:
+ metric_data_manager.post_metric_if_enabled(
+ self, test_case=test_case
+ )

  return self.score

@@ -183,16 +198,20 @@ class ConversationalGEval(BaseConversationalMetric):
  g_eval_params_str = construct_conversational_g_eval_turn_params_string(
  self.evaluation_params
  )
- prompt = ConversationalGEvalTemplate.generate_evaluation_steps(
+ prompt = self.evaluation_template.generate_evaluation_steps(
  criteria=self.criteria, parameters=g_eval_params_str
  )
  if self.using_native_model:
- res, cost = await self.model.a_generate(prompt, schema=Steps)
+ res, cost = await self.model.a_generate(
+ prompt, schema=cgschema.Steps
+ )
  self.evaluation_cost += cost
  return res.steps
  else:
  try:
- res: Steps = await self.model.a_generate(prompt, schema=Steps)
+ res: cgschema.Steps = await self.model.a_generate(
+ prompt, schema=cgschema.Steps
+ )
  return res.steps
  except TypeError:
  res = await self.model.a_generate(prompt)
@@ -206,16 +225,18 @@ class ConversationalGEval(BaseConversationalMetric):
  g_eval_params_str = construct_conversational_g_eval_turn_params_string(
  self.evaluation_params
  )
- prompt = ConversationalGEvalTemplate.generate_evaluation_steps(
+ prompt = self.evaluation_template.generate_evaluation_steps(
  criteria=self.criteria, parameters=g_eval_params_str
  )
  if self.using_native_model:
- res, cost = self.model.generate(prompt, schema=Steps)
+ res, cost = self.model.generate(prompt, schema=cgschema.Steps)
  self.evaluation_cost += cost
  return res.steps
  else:
  try:
- res: Steps = self.model.generate(prompt, schema=Steps)
+ res: cgschema.Steps = self.model.generate(
+ prompt, schema=cgschema.Steps
+ )
  return res.steps
  except TypeError:
  res = self.model.generate(prompt)
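
The wildcard schema import is replaced with a module alias, so the structured-output models are now referenced as `cgschema.Steps` and `cgschema.ReasonScore`. The diff only shows that `Steps` exposes a `.steps` list; a minimal sketch of what such a model plausibly looks like (the real definition lives in `deepeval/metrics/conversational_g_eval/schema.py` and may differ):

```python
# Plausible sketch of the Steps schema used above; not the actual definition.
from typing import List
from pydantic import BaseModel


class Steps(BaseModel):
    steps: List[str]  # one entry per generated evaluation step


# Models that support structured output receive the schema via `schema=` and
# are expected to hand back a parsed Steps instance, so `res.steps` is a list.
```
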
@@ -233,7 +254,7 @@ class ConversationalGEval(BaseConversationalMetric):
  )
  if not self.strict_mode:
  rubric_str = format_rubrics(self.rubric) if self.rubric else None
- prompt = ConversationalGEvalTemplate.generate_evaluation_results(
+ prompt = self.evaluation_template.generate_evaluation_results(
  evaluation_steps=self.number_evaluation_steps(),
  test_case_content=test_case_content,
  turns=[
@@ -244,7 +265,7 @@ class ConversationalGEval(BaseConversationalMetric):
  rubric=rubric_str,
  )
  else:
- prompt = ConversationalGEvalTemplate.generate_evaluation_results(
+ prompt = self.evaluation_template.generate_evaluation_results(
  evaluation_steps=self.number_evaluation_steps(),
  test_case_content=test_case_content,
  turns=[
@@ -270,21 +291,21 @@ class ConversationalGEval(BaseConversationalMetric):
  score, res
  )
  return weighted_summed_score, reason
- except:
+ except (KeyError, AttributeError, TypeError, ValueError):
  return score, reason
  except (
  AttributeError
  ): # This catches the case where a_generate_raw_response doesn't exist.
  if self.using_native_model:
  res, cost = await self.model.a_generate(
- prompt, schema=ReasonScore
+ prompt, schema=cgschema.ReasonScore
  )
  self.evaluation_cost += cost
  return res.score, res.reason
  else:
  try:
- res: ReasonScore = await self.model.a_generate(
- prompt, schema=ReasonScore
+ res: cgschema.ReasonScore = await self.model.a_generate(
+ prompt, schema=cgschema.ReasonScore
  )
  return res.score, res.reason
  except TypeError:
@@ -303,7 +324,7 @@ class ConversationalGEval(BaseConversationalMetric):
  )
  if not self.strict_mode:
  rubric_str = format_rubrics(self.rubric) if self.rubric else None
- prompt = ConversationalGEvalTemplate.generate_evaluation_results(
+ prompt = self.evaluation_template.generate_evaluation_results(
  evaluation_steps=self.number_evaluation_steps(),
  test_case_content=test_case_content,
  turns=[
@@ -314,7 +335,7 @@ class ConversationalGEval(BaseConversationalMetric):
  rubric=rubric_str,
  )
  else:
- prompt = ConversationalGEvalTemplate.generate_evaluation_results(
+ prompt = self.evaluation_template.generate_evaluation_results(
  evaluation_steps=self.number_evaluation_steps(),
  test_case_content=test_case_content,
  turns=[
@@ -340,18 +361,20 @@ class ConversationalGEval(BaseConversationalMetric):
  score, res
  )
  return weighted_summed_score, reason
- except:
+ except (KeyError, AttributeError, TypeError, ValueError):
  return score, reason
  except AttributeError:
  # This catches the case where a_generate_raw_response doesn't exist.
  if self.using_native_model:
- res, cost = self.model.generate(prompt, schema=ReasonScore)
+ res, cost = self.model.generate(
+ prompt, schema=cgschema.ReasonScore
+ )
  self.evaluation_cost += cost
  return res.score, res.reason
  else:
  try:
- res: ReasonScore = self.model.generate(
- prompt, schema=ReasonScore
+ res: cgschema.ReasonScore = self.model.generate(
+ prompt, schema=cgschema.ReasonScore
  )
  return res.score, res.reason
  except TypeError:
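
In the scoring paths above, the bare `except:` clauses are narrowed to concrete exception types and `ReasonScore` is likewise referenced through the schema alias. Only its `.score` and `.reason` attributes are visible in this diff, so the following is a plausible sketch rather than the actual definition:

```python
# Plausible sketch of the ReasonScore schema; the real model lives in
# deepeval/metrics/conversational_g_eval/schema.py and may differ.
from pydantic import BaseModel


class ReasonScore(BaseModel):
    reason: str   # short justification produced by the judge model
    score: float  # raw score later normalized / thresholded by the metric
```
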
@@ -362,49 +385,44 @@ class ConversationalGEval(BaseConversationalMetric):
  def generate_weighted_summed_score(
  self, raw_score: int, raw_response: ChatCompletion
  ) -> Union[int, float]:
- try:
- generated_logprobs = raw_response.choices[0].logprobs.content
- # First, locate the token that we care for logprobs, i.e., the token matching the score
- score_logprobs = None
- for token_logprobs in generated_logprobs:
- if token_logprobs.token == str(raw_score):
- score_logprobs = token_logprobs
- break
- # Then, calculate the score based on the logprobs
- token_linear_probability: Dict[int, float] = {}
- sum_linear_probability = 0
- # Filter out tokens with <1% linear probability, i.e., logprobs < math.log(0.01)
- min_logprob = math.log(0.01)
- for token_logprob in score_logprobs.top_logprobs:
- logprob = token_logprob.logprob
-
- # Filter out low probability tokens
- if logprob < min_logprob:
- continue
- # Filter out non-decimal token to prevent errors in later int(token) conversion
- if not token_logprob.token.isdecimal():
- continue
-
- # Calculate the linear probability
- linear_prob = math.exp(logprob)
- token_score = int(token_logprob.token)
- if token_linear_probability.get(token_score):
- token_linear_probability[token_score] += linear_prob
- else:
- token_linear_probability[token_score] = linear_prob
- sum_linear_probability += linear_prob
-
- sum_of_weighted_scores = 0.0
- for score, prob in token_linear_probability.items():
- sum_of_weighted_scores += score * prob
-
- # Scale the sum of linear probability to 1
- weighted_summed_score = (
- sum_of_weighted_scores / sum_linear_probability
- )
- return weighted_summed_score
- except:
- raise
+ generated_logprobs = raw_response.choices[0].logprobs.content
+ # First, locate the token that we care for logprobs, i.e., the token matching the score
+ score_logprobs = None
+ for token_logprobs in generated_logprobs:
+ if token_logprobs.token == str(raw_score):
+ score_logprobs = token_logprobs
+ break
+ # Then, calculate the score based on the logprobs
+ token_linear_probability: Dict[int, float] = {}
+ sum_linear_probability = 0
+ # Filter out tokens with <1% linear probability, i.e., logprobs < math.log(0.01)
+ min_logprob = math.log(0.01)
+ for token_logprob in score_logprobs.top_logprobs:
+ logprob = token_logprob.logprob
+
+ # Filter out low probability tokens
+ if logprob < min_logprob:
+ continue
+ # Filter out non-decimal token to prevent errors in later int(token) conversion
+ if not token_logprob.token.isdecimal():
+ continue
+
+ # Calculate the linear probability
+ linear_prob = math.exp(logprob)
+ token_score = int(token_logprob.token)
+ if token_linear_probability.get(token_score):
+ token_linear_probability[token_score] += linear_prob
+ else:
+ token_linear_probability[token_score] = linear_prob
+ sum_linear_probability += linear_prob
+
+ sum_of_weighted_scores = 0.0
+ for score, prob in token_linear_probability.items():
+ sum_of_weighted_scores += score * prob
+
+ # Scale the sum of linear probability to 1
+ weighted_summed_score = sum_of_weighted_scores / sum_linear_probability
+ return weighted_summed_score

  def number_evaluation_steps(self):
  evaluation_steps = """"""
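
The weighted-score helper is unchanged in substance; the diff only drops the redundant `try`/`except` that re-raised unconditionally. It still converts the judge model's top logprobs for the score token into a probability-weighted average. A self-contained sketch of the arithmetic with made-up logprobs:

```python
import math

# Hypothetical top_logprobs for the score token "8": the judge also put some
# probability mass on "7" and "9", plus a punctuation token that gets skipped.
top_logprobs = [("8", -0.22), ("7", -1.90), ("9", -3.00), (",", -4.00)]

min_logprob = math.log(0.01)  # drop tokens below 1% linear probability
weights = {}
total = 0.0
for token, logprob in top_logprobs:
    if logprob < min_logprob or not token.isdecimal():
        continue                      # skip negligible and non-numeric tokens
    p = math.exp(logprob)             # logprob -> linear probability
    weights[int(token)] = weights.get(int(token), 0.0) + p
    total += p

weighted = sum(score * p for score, p in weights.items()) / total
print(round(weighted, 3))  # ~7.9: slightly below the raw "8" because of the mass on "7"
```
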
@@ -417,8 +435,8 @@ class ConversationalGEval(BaseConversationalMetric):
  self.success = False
  else:
  try:
- self.score >= self.threshold
- except:
+ self.success = self.score >= self.threshold
+ except TypeError:
  self.success = False
  return self.success


deepeval/metrics/dag/dag.py

@@ -18,6 +18,7 @@ from deepeval.metrics.dag.utils import (
  is_valid_dag_from_roots,
  extract_required_params,
  )
+ from deepeval.metrics.api import metric_data_manager


  class DAGMetric(BaseMetric):
@@ -59,6 +60,7 @@ class DAGMetric(BaseMetric):
  test_case: LLMTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  ) -> float:
  check_llm_test_case_params(
  test_case,
@@ -77,6 +79,7 @@ class DAGMetric(BaseMetric):
  test_case,
  _show_indicator=False,
  _in_component=_in_component,
+ _log_metric_to_confident=_log_metric_to_confident,
  )
  )
  else:
@@ -89,6 +92,10 @@ class DAGMetric(BaseMetric):
  f"Score: {self.score}\nReason: {self.reason}",
  ],
  )
+ if _log_metric_to_confident:
+ metric_data_manager.post_metric_if_enabled(
+ self, test_case=test_case
+ )
  return self.score

  async def a_measure(
@@ -96,6 +103,7 @@ class DAGMetric(BaseMetric):
  test_case: LLMTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  ) -> float:
  check_llm_test_case_params(
  test_case,
@@ -119,6 +127,10 @@ class DAGMetric(BaseMetric):
  f"Score: {self.score}\nReason: {self.reason}",
  ],
  )
+ if _log_metric_to_confident:
+ metric_data_manager.post_metric_if_enabled(
+ self, test_case=test_case
+ )
  return self.score

  def is_successful(self) -> bool:

deepeval/metrics/dag/nodes.py

@@ -111,7 +111,9 @@ class VerdictNode(BaseNode):
  copied_g_eval = GEval(**g_eval_args)

  copied_g_eval.measure(
- test_case=test_case, _show_indicator=False
+ test_case=test_case,
+ _show_indicator=False,
+ _log_metric_to_confident=False,
  )
  metric._verbose_steps.append(
  construct_node_verbose_log(self, depth, copied_g_eval)
@@ -124,7 +126,9 @@ class VerdictNode(BaseNode):
  copied_metric.verbose_mode = False

  copied_metric.measure(
- test_case=test_case, _show_indicator=False
+ test_case=test_case,
+ _show_indicator=False,
+ _log_metric_to_confident=False,
  )
  metric._verbose_steps.append(
  construct_node_verbose_log(self, depth, copied_metric)
@@ -174,7 +178,9 @@ class VerdictNode(BaseNode):
  copied_g_eval = GEval(**g_eval_args)

  await copied_g_eval.a_measure(
- test_case=test_case, _show_indicator=False
+ test_case=test_case,
+ _show_indicator=False,
+ _log_metric_to_confident=False,
  )
  metric._verbose_steps.append(
  construct_node_verbose_log(self, depth, copied_g_eval)
@@ -188,7 +194,9 @@ class VerdictNode(BaseNode):
  copied_metric.verbose_mode = False

  await copied_metric.a_measure(
- test_case=test_case, _show_indicator=False
+ test_case=test_case,
+ _show_indicator=False,
+ _log_metric_to_confident=False,
  )
  metric._verbose_steps.append(
  construct_node_verbose_log(self, depth, copied_metric)
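
The verdict nodes now pass `_log_metric_to_confident=False` when they run child metrics, so only the top-level metric posts its result to Confident AI. A sketch of the resulting calling convention (the flag is underscore-prefixed, i.e. effectively internal, so relying on it in user code is a judgment call):

```python
# Sketch of the logging split introduced in 3.6.8; parameter names are taken
# from the diff above.
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCase, LLMTestCaseParams

test_case = LLMTestCase(
    input="What is DeepEval?",
    actual_output="An open-source LLM evaluation framework.",
)
metric = GEval(
    name="Clarity",
    criteria="Is the answer clear and direct?",
    evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT],
)

# Top-level call: keeps the default and, if enabled, posts the result upstream.
metric.measure(test_case)

# Internal-style call, as the DAG nodes now do: suppress the duplicate upload.
metric.measure(test_case, _show_indicator=False, _log_metric_to_confident=False)
```
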

deepeval/metrics/dag/schema.py

@@ -11,7 +11,7 @@ class TaskNodeOutput(BaseModel):


  class BinaryJudgementVerdict(BaseModel):
- verdict: Literal[True, False]
+ verdict: bool
  reason: str



deepeval/metrics/dag/templates.py

@@ -60,10 +60,10 @@ class BinaryJudgementTemplate:
  {text}

  **
- IMPORTANT: Please make sure to only return a json with two keys: `verdict` (true or false), and the 'reason' key providing the reason.
+ IMPORTANT: Please make sure to only return a json with two keys: `verdict` (True or False), and the 'reason' key providing the reason. The verdict must be a boolean only, either True or False.
  Example JSON:
  {{
- "verdict": true,
+ "verdict": True,
  "reason": "..."
  }}
  **
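
Relaxing `verdict` from `Literal[True, False]` to `bool`, together with the prompt now asking for `True`/`False`, lets Pydantic's lax bool coercion accept Python-style capitalization that the literal annotation would reject. A minimal sketch (not deepeval's actual schema module) illustrating the difference:

```python
# Minimal sketch showing why `bool` is more forgiving than Literal[True, False]
# when the judge model replies with Python-style "True"/"False".
from typing import Literal
from pydantic import BaseModel, ValidationError


class StrictVerdict(BaseModel):
    verdict: Literal[True, False]  # pre-3.6.8 behaviour
    reason: str


class LooseVerdict(BaseModel):
    verdict: bool  # 3.6.8 behaviour
    reason: str


payload = {"verdict": "True", "reason": "matches the criteria"}

print(LooseVerdict(**payload).verdict)  # True -- the string is coerced to a bool

try:
    StrictVerdict(**payload)
except ValidationError as e:
    print("strict schema rejects the string:", e.errors()[0]["type"])
```
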

deepeval/metrics/faithfulness/faithfulness.py

@@ -23,6 +23,7 @@ from deepeval.metrics.faithfulness.schema import (
  Truths,
  Claims,
  )
+ from deepeval.metrics.api import metric_data_manager


  class FaithfulnessMetric(BaseMetric):
@@ -63,6 +64,7 @@ class FaithfulnessMetric(BaseMetric):
  test_case: LLMTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  ) -> float:

  check_llm_test_case_params(test_case, self._required_params, self)
@@ -78,6 +80,7 @@ class FaithfulnessMetric(BaseMetric):
  test_case,
  _show_indicator=False,
  _in_component=_in_component,
+ _log_metric_to_confident=_log_metric_to_confident,
  )
  )
  else:
@@ -96,6 +99,10 @@ class FaithfulnessMetric(BaseMetric):
  f"Score: {self.score}\nReason: {self.reason}",
  ],
  )
+ if _log_metric_to_confident:
+ metric_data_manager.post_metric_if_enabled(
+ self, test_case=test_case
+ )

  return self.score

@@ -104,6 +111,7 @@ class FaithfulnessMetric(BaseMetric):
  test_case: LLMTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  ) -> float:

  check_llm_test_case_params(test_case, self._required_params, self)
@@ -132,7 +140,10 @@ class FaithfulnessMetric(BaseMetric):
  f"Score: {self.score}\nReason: {self.reason}",
  ],
  )
-
+ if _log_metric_to_confident:
+ metric_data_manager.post_metric_if_enabled(
+ self, test_case=test_case
+ )
  return self.score

  async def _a_generate_reason(self) -> str:
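
Because `_log_metric_to_confident` defaults to `True`, existing `FaithfulnessMetric` calls behave exactly as before; the flag only matters when a caller wants to suppress the upload. A sketch of standard usage (threshold and test-case values are illustrative):

```python
# Standard deepeval usage; the new flag is only passed in the last call.
from deepeval.metrics import FaithfulnessMetric
from deepeval.test_case import LLMTestCase

test_case = LLMTestCase(
    input="When was the shop founded?",
    actual_output="The shop was founded in 1998.",
    retrieval_context=["Our shop has been serving customers since 1998."],
)

metric = FaithfulnessMetric(threshold=0.7)
metric.measure(test_case)                                  # may post to Confident AI if enabled
print(metric.score, metric.reason)

metric.measure(test_case, _log_metric_to_confident=False)  # opt out of the upload
```
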

deepeval/metrics/g_eval/g_eval.py

@@ -31,6 +31,7 @@ from deepeval.metrics.g_eval.utils import (
  number_evaluation_steps,
  get_score_range,
  )
+ from deepeval.metrics.api import metric_data_manager
  from deepeval.config.settings import get_settings


@@ -74,6 +75,7 @@ class GEval(BaseMetric):
  test_case: LLMTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  _additional_context: Optional[str] = None,
  ) -> float:
  check_llm_test_case_params(test_case, self.evaluation_params, self)
@@ -122,6 +124,10 @@ class GEval(BaseMetric):
  f"Reason: {self.reason}",
  ],
  )
+ if _log_metric_to_confident:
+ metric_data_manager.post_metric_if_enabled(
+ self, test_case=test_case
+ )

  return self.score

@@ -130,6 +136,7 @@ class GEval(BaseMetric):
  test_case: LLMTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  _additional_context: Optional[str] = None,
  ) -> float:
  check_llm_test_case_params(test_case, self.evaluation_params, self)
@@ -165,6 +172,10 @@ class GEval(BaseMetric):
  f"Reason: {self.reason}",
  ],
  )
+ if _log_metric_to_confident:
+ metric_data_manager.post_metric_if_enabled(
+ self, test_case=test_case
+ )
  return self.score

  async def _a_generate_evaluation_steps(self) -> List[str]:

deepeval/metrics/goal_accuracy/__init__.py (new file)

@@ -0,0 +1 @@
+ from .goal_accuracy import GoalAccuracyMetric
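
The only hunk shown for the new `goal_accuracy` package is its `__init__.py`, which re-exports the metric class, so the import below follows directly from it; the metric's constructor signature is not part of this diff:

```python
# Valid as of 3.6.8 per the __init__.py above; constructor arguments are not
# shown in this diff, so none are assumed here.
from deepeval.metrics.goal_accuracy import GoalAccuracyMetric
```
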