deepeval 3.6.5__py3-none-any.whl → 3.6.7__py3-none-any.whl

This diff compares the contents of two publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in the public registry.
Files changed (97)
  1. deepeval/__init__.py +42 -10
  2. deepeval/_version.py +1 -1
  3. deepeval/benchmarks/equity_med_qa/equity_med_qa.py +1 -0
  4. deepeval/cli/main.py +42 -0
  5. deepeval/confident/api.py +1 -0
  6. deepeval/config/logging.py +33 -0
  7. deepeval/config/settings.py +176 -16
  8. deepeval/constants.py +8 -1
  9. deepeval/dataset/dataset.py +2 -11
  10. deepeval/dataset/utils.py +1 -1
  11. deepeval/evaluate/evaluate.py +5 -1
  12. deepeval/evaluate/execute.py +118 -60
  13. deepeval/evaluate/utils.py +20 -116
  14. deepeval/integrations/crewai/__init__.py +6 -1
  15. deepeval/integrations/crewai/handler.py +1 -1
  16. deepeval/integrations/crewai/subs.py +51 -0
  17. deepeval/integrations/crewai/wrapper.py +45 -5
  18. deepeval/metrics/answer_relevancy/answer_relevancy.py +12 -3
  19. deepeval/metrics/api.py +281 -0
  20. deepeval/metrics/argument_correctness/argument_correctness.py +12 -2
  21. deepeval/metrics/bias/bias.py +12 -3
  22. deepeval/metrics/contextual_precision/contextual_precision.py +12 -3
  23. deepeval/metrics/contextual_recall/contextual_recall.py +12 -3
  24. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +12 -1
  25. deepeval/metrics/conversation_completeness/conversation_completeness.py +12 -0
  26. deepeval/metrics/conversational_dag/conversational_dag.py +12 -0
  27. deepeval/metrics/conversational_dag/nodes.py +12 -4
  28. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +73 -59
  29. deepeval/metrics/dag/dag.py +12 -0
  30. deepeval/metrics/dag/nodes.py +12 -4
  31. deepeval/metrics/faithfulness/faithfulness.py +12 -1
  32. deepeval/metrics/g_eval/g_eval.py +37 -15
  33. deepeval/metrics/hallucination/hallucination.py +12 -1
  34. deepeval/metrics/indicator.py +8 -2
  35. deepeval/metrics/json_correctness/json_correctness.py +12 -1
  36. deepeval/metrics/knowledge_retention/knowledge_retention.py +12 -0
  37. deepeval/metrics/mcp/mcp_task_completion.py +13 -0
  38. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +13 -0
  39. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +12 -1
  40. deepeval/metrics/misuse/misuse.py +12 -1
  41. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +3 -0
  42. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +3 -0
  43. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +3 -0
  44. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +3 -0
  45. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +6 -1
  46. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +6 -1
  47. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +3 -0
  48. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +3 -0
  49. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +3 -0
  50. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +3 -0
  51. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +10 -5
  52. deepeval/metrics/non_advice/non_advice.py +12 -0
  53. deepeval/metrics/pii_leakage/pii_leakage.py +12 -1
  54. deepeval/metrics/prompt_alignment/prompt_alignment.py +53 -24
  55. deepeval/metrics/role_adherence/role_adherence.py +12 -0
  56. deepeval/metrics/role_violation/role_violation.py +12 -0
  57. deepeval/metrics/summarization/summarization.py +12 -1
  58. deepeval/metrics/task_completion/task_completion.py +3 -0
  59. deepeval/metrics/tool_correctness/tool_correctness.py +8 -0
  60. deepeval/metrics/toxicity/toxicity.py +12 -0
  61. deepeval/metrics/turn_relevancy/turn_relevancy.py +12 -0
  62. deepeval/models/llms/grok_model.py +1 -1
  63. deepeval/models/llms/openai_model.py +2 -0
  64. deepeval/models/retry_policy.py +202 -11
  65. deepeval/openai/__init__.py +14 -32
  66. deepeval/openai/extractors.py +24 -34
  67. deepeval/openai/patch.py +256 -161
  68. deepeval/openai/types.py +20 -0
  69. deepeval/openai/utils.py +98 -56
  70. deepeval/prompt/__init__.py +19 -1
  71. deepeval/prompt/api.py +160 -0
  72. deepeval/prompt/prompt.py +244 -62
  73. deepeval/prompt/utils.py +144 -2
  74. deepeval/synthesizer/chunking/context_generator.py +209 -152
  75. deepeval/synthesizer/chunking/doc_chunker.py +46 -12
  76. deepeval/synthesizer/synthesizer.py +8 -5
  77. deepeval/test_case/api.py +131 -0
  78. deepeval/test_run/__init__.py +1 -0
  79. deepeval/test_run/hyperparameters.py +47 -8
  80. deepeval/test_run/test_run.py +104 -1
  81. deepeval/tracing/api.py +3 -1
  82. deepeval/tracing/message_types/__init__.py +10 -0
  83. deepeval/tracing/message_types/base.py +6 -0
  84. deepeval/tracing/message_types/messages.py +14 -0
  85. deepeval/tracing/message_types/tools.py +18 -0
  86. deepeval/tracing/otel/exporter.py +0 -6
  87. deepeval/tracing/otel/utils.py +58 -8
  88. deepeval/tracing/trace_context.py +73 -4
  89. deepeval/tracing/trace_test_manager.py +19 -0
  90. deepeval/tracing/tracing.py +52 -4
  91. deepeval/tracing/types.py +16 -0
  92. deepeval/tracing/utils.py +8 -0
  93. {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/METADATA +1 -1
  94. {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/RECORD +97 -87
  95. {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/LICENSE.md +0 -0
  96. {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/WHEEL +0 -0
  97. {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/entry_points.txt +0 -0
deepeval/metrics/g_eval/g_eval.py

@@ -1,5 +1,7 @@
  """LLM evaluated metric based on the GEval framework: https://arxiv.org/pdf/2303.16634.pdf"""

+ import asyncio
+
  from typing import Optional, List, Tuple, Union, Type
  from deepeval.metrics import BaseMetric
  from deepeval.test_case import (
@@ -16,7 +18,7 @@ from deepeval.metrics.utils import (
  )
  from deepeval.models import DeepEvalBaseLLM
  from deepeval.metrics.indicator import metric_progress_indicator
- from deepeval.metrics.g_eval.schema import *
+ from deepeval.metrics.g_eval import schema as gschema
  from deepeval.metrics.g_eval.utils import (
  Rubric,
  construct_g_eval_params_string,
@@ -29,6 +31,8 @@ from deepeval.metrics.g_eval.utils import (
  number_evaluation_steps,
  get_score_range,
  )
+ from deepeval.metrics.api import metric_data_manager
+ from deepeval.config.settings import get_settings


  class GEval(BaseMetric):
@@ -71,6 +75,7 @@ class GEval(BaseMetric):
  test_case: LLMTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  _additional_context: Optional[str] = None,
  ) -> float:
  check_llm_test_case_params(test_case, self.evaluation_params, self)
@@ -81,12 +86,16 @@
  ):
  if self.async_mode:
  loop = get_or_create_event_loop()
+ coro = self.a_measure(
+ test_case,
+ _show_indicator=False,
+ _in_component=_in_component,
+ _additional_context=_additional_context,
+ )
  loop.run_until_complete(
- self.a_measure(
- test_case,
- _show_indicator=False,
- _in_component=_in_component,
- _additional_context=_additional_context,
+ asyncio.wait_for(
+ coro,
+ timeout=get_settings().DEEPEVAL_PER_TASK_TIMEOUT_SECONDS,
  )
  )
  else:
@@ -115,6 +124,10 @@ class GEval(BaseMetric):
  f"Reason: {self.reason}",
  ],
  )
+ if _log_metric_to_confident:
+ metric_data_manager.post_metric_if_enabled(
+ self, test_case=test_case
+ )

  return self.score

@@ -123,6 +136,7 @@ class GEval(BaseMetric):
  test_case: LLMTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  _additional_context: Optional[str] = None,
  ) -> float:
  check_llm_test_case_params(test_case, self.evaluation_params, self)
@@ -158,6 +172,10 @@ class GEval(BaseMetric):
  f"Reason: {self.reason}",
  ],
  )
+ if _log_metric_to_confident:
+ metric_data_manager.post_metric_if_enabled(
+ self, test_case=test_case
+ )
  return self.score

  async def _a_generate_evaluation_steps(self) -> List[str]:
@@ -177,7 +195,9 @@ class GEval(BaseMetric):
  return data["steps"]
  else:
  try:
- res: Steps = await self.model.a_generate(prompt, schema=Steps)
+ res: gschema.Steps = await self.model.a_generate(
+ prompt, schema=gschema.Steps
+ )
  return res.steps
  except TypeError:
  res = await self.model.a_generate(prompt)
@@ -201,7 +221,9 @@ class GEval(BaseMetric):
  return data["steps"]
  else:
  try:
- res: Steps = self.model.generate(prompt, schema=Steps)
+ res: gschema.Steps = self.model.generate(
+ prompt, schema=gschema.Steps
+ )
  return res.steps
  except TypeError:
  res = self.model.generate(prompt)
@@ -264,7 +286,7 @@ class GEval(BaseMetric):
  score, res
  )
  return weighted_summed_score, reason
- except:
+ except (KeyError, AttributeError, TypeError, ValueError):
  return score, reason
  except (
  AttributeError
@@ -276,8 +298,8 @@ class GEval(BaseMetric):
  return data["score"], data["reason"]
  else:
  try:
- res: ReasonScore = await self.model.a_generate(
- prompt, schema=ReasonScore
+ res: gschema.ReasonScore = await self.model.a_generate(
+ prompt, schema=gschema.ReasonScore
  )
  return res.score, res.reason
  except TypeError:
@@ -338,7 +360,7 @@ class GEval(BaseMetric):
  score, res
  )
  return weighted_summed_score, reason
- except:
+ except (KeyError, AttributeError, TypeError, ValueError):
  return score, reason
  except AttributeError:
  # This catches the case where a_generate_raw_response doesn't exist.
@@ -349,8 +371,8 @@ class GEval(BaseMetric):
  return data["score"], data["reason"]
  else:
  try:
- res: ReasonScore = self.model.generate(
- prompt, schema=ReasonScore
+ res: gschema.ReasonScore = self.model.generate(
+ prompt, schema=gschema.ReasonScore
  )
  return res.score, res.reason
  except TypeError:
@@ -364,7 +386,7 @@ class GEval(BaseMetric):
  else:
  try:
  self.success = self.score >= self.threshold
- except:
+ except TypeError:
  self.success = False
  return self.success

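Note on the g_eval.py hunks above: the synchronous measure() path now builds the a_measure() coroutine up front and runs it under asyncio.wait_for, bounded by the new DEEPEVAL_PER_TASK_TIMEOUT_SECONDS setting. A minimal, self-contained sketch of that pattern follows; run_bounded and fake_a_measure are illustrative stand-ins rather than deepeval APIs, while asyncio.wait_for and the settings name come from the diff itself.

import asyncio

def run_bounded(coro, timeout_seconds):
    # Run a coroutine from synchronous code, cancelling it if it exceeds the
    # per-task timeout (mirrors the wait_for wrapping shown in the diff).
    loop = asyncio.new_event_loop()
    try:
        return loop.run_until_complete(
            asyncio.wait_for(coro, timeout=timeout_seconds)
        )
    finally:
        loop.close()

async def fake_a_measure():
    # Stand-in for a metric's async work: sleeps briefly, then returns a score.
    await asyncio.sleep(0.1)
    return 0.9

print(run_bounded(fake_a_measure(), timeout_seconds=30))  # -> 0.9

If the coroutine overruns the limit, asyncio.wait_for cancels it and raises asyncio.TimeoutError, so a hung model call no longer blocks the synchronous caller indefinitely.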
deepeval/metrics/hallucination/hallucination.py

@@ -16,6 +16,7 @@ from deepeval.metrics.hallucination.template import HallucinationTemplate
  from deepeval.models import DeepEvalBaseLLM
  from deepeval.metrics.indicator import metric_progress_indicator
  from deepeval.metrics.hallucination.schema import *
+ from deepeval.metrics.api import metric_data_manager

  required_params: List[LLMTestCaseParams] = [
  LLMTestCaseParams.INPUT,
@@ -51,6 +52,7 @@ class HallucinationMetric(BaseMetric):
  test_case: LLMTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  ) -> float:

  check_llm_test_case_params(test_case, required_params, self)
@@ -66,6 +68,7 @@ class HallucinationMetric(BaseMetric):
  test_case,
  _show_indicator=False,
  _in_component=_in_component,
+ _log_metric_to_confident=_log_metric_to_confident,
  )
  )
  else:
@@ -84,6 +87,10 @@ class HallucinationMetric(BaseMetric):
  f"Score: {self.score}\nReason: {self.reason}",
  ],
  )
+ if _log_metric_to_confident:
+ metric_data_manager.post_metric_if_enabled(
+ self, test_case=test_case
+ )

  return self.score

@@ -92,6 +99,7 @@ class HallucinationMetric(BaseMetric):
  test_case: LLMTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  ) -> float:

  check_llm_test_case_params(test_case, required_params, self)
@@ -118,7 +126,10 @@ class HallucinationMetric(BaseMetric):
  f"Score: {self.score}\nReason: {self.reason}",
  ],
  )
-
+ if _log_metric_to_confident:
+ metric_data_manager.post_metric_if_enabled(
+ self, test_case=test_case
+ )
  return self.score

  async def _a_generate_reason(self):
deepeval/metrics/indicator.py

@@ -100,6 +100,7 @@ async def measure_metric_task(
  test_case,
  _show_indicator=False,
  _in_component=_in_component,
+ _log_metric_to_confident=False,
  )
  finish_text = "Done"
  except MissingTestCaseParamsError as e:
@@ -116,7 +117,9 @@ async def measure_metric_task(
  except TypeError:
  try:
  await metric.a_measure(
- test_case, _in_component=_in_component
+ test_case,
+ _in_component=_in_component,
+ _log_metric_to_confident=False,
  )
  finish_text = "Done"
  except MissingTestCaseParamsError as e:
@@ -241,7 +244,10 @@ async def safe_a_measure(
  ):
  try:
  await metric.a_measure(
- tc, _show_indicator=False, _in_component=_in_component
+ tc,
+ _show_indicator=False,
+ _in_component=_in_component,
+ _log_metric_to_confident=False,
  )
  update_pbar(progress, pbar_eval_id)
  except MissingTestCaseParamsError as e:
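Note on the indicator.py hunks above, and on the matching hunks in the metric files throughout this diff: measure() and a_measure() gain a _log_metric_to_confident flag that defaults to True, and the internal runners in indicator.py pass False so that metrics executed through an evaluation run are not posted a second time. A rough sketch of that gating idea follows; the function bodies here are placeholders, and post_metric_if_enabled only mimics the manager call shown in the diff.

def post_metric_if_enabled(metric_name, test_case_id):
    # Stand-in for metric_data_manager.post_metric_if_enabled in the diff.
    print(f"posting {metric_name} result for {test_case_id} to Confident AI")

def measure(metric_name, test_case_id, _log_metric_to_confident=True):
    score = 0.8  # placeholder for the real LLM-judged score
    if _log_metric_to_confident:
        # Standalone metric.measure() calls keep the default and post once.
        post_metric_if_enabled(metric_name, test_case_id)
    return score

measure("GEval", "tc-1")                                  # standalone call: posts once
measure("GEval", "tc-1", _log_metric_to_confident=False)  # inside a runner: no duplicate post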
deepeval/metrics/json_correctness/json_correctness.py

@@ -18,6 +18,7 @@ from deepeval.metrics.indicator import metric_progress_indicator
  from deepeval.metrics.json_correctness.template import JsonCorrectnessTemplate
  from deepeval.metrics.json_correctness.schema import JsonCorrectnessScoreReason
  from deepeval.utils import get_or_create_event_loop
+ from deepeval.metrics.api import metric_data_manager

  DEFAULT_CORRECT_REASON = "The generated Json matches and is syntactically correct to the expected schema."

@@ -51,6 +52,7 @@ class JsonCorrectnessMetric(BaseMetric):
  test_case: LLMTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  ) -> float:

  check_llm_test_case_params(test_case, self._required_params, self)
@@ -66,6 +68,7 @@ class JsonCorrectnessMetric(BaseMetric):
  test_case,
  _show_indicator=False,
  _in_component=_in_component,
+ _log_metric_to_confident=_log_metric_to_confident,
  )
  )
  else:
@@ -88,6 +91,10 @@ class JsonCorrectnessMetric(BaseMetric):
  f"Score: {self.score}\nReason: {self.reason}",
  ],
  )
+ if _log_metric_to_confident:
+ metric_data_manager.post_metric_if_enabled(
+ self, test_case=test_case
+ )

  return self.score

@@ -96,6 +103,7 @@ class JsonCorrectnessMetric(BaseMetric):
  test_case: LLMTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  ) -> float:

  check_llm_test_case_params(test_case, self._required_params, self)
@@ -126,7 +134,10 @@ class JsonCorrectnessMetric(BaseMetric):
  f"Score: {self.score}\nReason: {self.reason}",
  ],
  )
-
+ if _log_metric_to_confident:
+ metric_data_manager.post_metric_if_enabled(
+ self, test_case=test_case
+ )
  return self.score

  async def a_generate_reason(self, actual_output: str) -> str:
deepeval/metrics/knowledge_retention/knowledge_retention.py

@@ -20,6 +20,7 @@ from deepeval.metrics.knowledge_retention.schema import (
  KnowledgeRetentionScoreReason,
  )
  from deepeval.utils import get_or_create_event_loop, prettify_list
+ from deepeval.metrics.api import metric_data_manager


  class KnowledgeRetentionMetric(BaseConversationalMetric):
@@ -47,6 +48,7 @@ class KnowledgeRetentionMetric(BaseConversationalMetric):
  test_case: ConversationalTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  ):
  check_conversational_test_case_params(
  test_case, self._required_test_case_params, self
@@ -63,6 +65,7 @@ class KnowledgeRetentionMetric(BaseConversationalMetric):
  test_case,
  _show_indicator=False,
  _in_component=_in_component,
+ _log_metric_to_confident=_log_metric_to_confident,
  )
  )
  else:
@@ -84,6 +87,10 @@ class KnowledgeRetentionMetric(BaseConversationalMetric):
  f"Score: {self.score}\nReason: {self.reason}",
  ],
  )
+ if _log_metric_to_confident:
+ metric_data_manager.post_metric_if_enabled(
+ self, test_case=test_case
+ )
  return self.score

  async def a_measure(
@@ -91,6 +98,7 @@ class KnowledgeRetentionMetric(BaseConversationalMetric):
  test_case: ConversationalTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  ) -> float:
  check_conversational_test_case_params(
  test_case, self._required_test_case_params, self
@@ -120,6 +128,10 @@ class KnowledgeRetentionMetric(BaseConversationalMetric):
  f"Score: {self.score}\nReason: {self.reason}",
  ],
  )
+ if _log_metric_to_confident:
+ metric_data_manager.post_metric_if_enabled(
+ self, test_case=test_case
+ )
  return self.score

  async def _a_generate_reason(self) -> str:
deepeval/metrics/mcp/mcp_task_completion.py

@@ -16,6 +16,7 @@ from deepeval.utils import get_or_create_event_loop, prettify_list
  from deepeval.metrics.mcp.schema import Task, TaskScore
  from deepeval.metrics.mcp.template import MCPTaskCompletionTemplate
  from deepeval.errors import MissingTestCaseParamsError
+ from deepeval.metrics.api import metric_data_manager


  class MCPTaskCompletionMetric(BaseConversationalMetric):
@@ -46,6 +47,7 @@ class MCPTaskCompletionMetric(BaseConversationalMetric):
  test_case: ConversationalTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  ):
  check_conversational_test_case_params(
  test_case, self._required_test_case_params, self
@@ -62,6 +64,7 @@ class MCPTaskCompletionMetric(BaseConversationalMetric):
  test_case,
  _show_indicator=False,
  _in_component=_in_component,
+ _log_metric_to_confident=_log_metric_to_confident,
  )
  )
  else:
@@ -90,6 +93,10 @@ class MCPTaskCompletionMetric(BaseConversationalMetric):
  f"Score: {self.score}",
  ],
  )
+ if _log_metric_to_confident:
+ metric_data_manager.post_metric_if_enabled(
+ self, test_case=test_case
+ )
  return self.score

  async def a_measure(
@@ -97,6 +104,7 @@ class MCPTaskCompletionMetric(BaseConversationalMetric):
  test_case: ConversationalTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  ):
  check_conversational_test_case_params(
  test_case, self._required_test_case_params, self
@@ -131,6 +139,11 @@ class MCPTaskCompletionMetric(BaseConversationalMetric):
  f"Score: {self.score}",
  ],
  )
+ if _log_metric_to_confident:
+ metric_data_manager.post_metric_if_enabled(
+ self, test_case=test_case
+ )
+
  return self.score

  def _generate_reason(self, task_scores: List[TaskScore]) -> str:
deepeval/metrics/mcp/multi_turn_mcp_use_metric.py

@@ -16,6 +16,7 @@ from deepeval.utils import get_or_create_event_loop, prettify_list
  from deepeval.metrics.mcp.schema import Task, ArgsScore, ToolScore
  from deepeval.metrics.mcp.template import MCPTaskCompletionTemplate
  from deepeval.errors import MissingTestCaseParamsError
+ from deepeval.metrics.api import metric_data_manager


  class MultiTurnMCPUseMetric(BaseConversationalMetric):
@@ -46,6 +47,7 @@ class MultiTurnMCPUseMetric(BaseConversationalMetric):
  test_case: ConversationalTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  ):
  check_conversational_test_case_params(
  test_case, self._required_test_case_params, self
@@ -62,6 +64,7 @@ class MultiTurnMCPUseMetric(BaseConversationalMetric):
  test_case,
  _show_indicator=False,
  _in_component=_in_component,
+ _log_metric_to_confident=_log_metric_to_confident,
  )
  )
  else:
@@ -102,6 +105,11 @@ class MultiTurnMCPUseMetric(BaseConversationalMetric):
  f"Score: {self.score}",
  ],
  )
+ if _log_metric_to_confident:
+ metric_data_manager.post_metric_if_enabled(
+ self, test_case=test_case
+ )
+
  return self.score

  async def a_measure(
@@ -109,6 +117,7 @@ class MultiTurnMCPUseMetric(BaseConversationalMetric):
  test_case: ConversationalTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  ):
  check_conversational_test_case_params(
  test_case, self._required_test_case_params, self
@@ -161,6 +170,10 @@ class MultiTurnMCPUseMetric(BaseConversationalMetric):
  f"Score: {self.score}",
  ],
  )
+ if _log_metric_to_confident:
+ metric_data_manager.post_metric_if_enabled(
+ self, test_case=test_case
+ )
  return self.score

  def _get_tool_accuracy_score(
deepeval/metrics/mcp_use_metric/mcp_use_metric.py

@@ -20,6 +20,7 @@ from deepeval.models import DeepEvalBaseLLM
  from deepeval.metrics.indicator import metric_progress_indicator
  from .template import MCPUseMetricTemplate
  from .schema import MCPPrimitivesScore, MCPArgsScore
+ from deepeval.metrics.api import metric_data_manager


  class MCPUseMetric(BaseMetric):
@@ -51,6 +52,7 @@ class MCPUseMetric(BaseMetric):
  test_case: LLMTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  ) -> float:
  check_llm_test_case_params(test_case, self._required_params, self)

@@ -65,6 +67,7 @@ class MCPUseMetric(BaseMetric):
  test_case,
  _show_indicator=False,
  _in_component=_in_component,
+ _log_metric_to_confident=_log_metric_to_confident,
  )
  )
  else:
@@ -104,6 +107,10 @@ class MCPUseMetric(BaseMetric):
  self,
  steps=steps,
  )
+ if _log_metric_to_confident:
+ metric_data_manager.post_metric_if_enabled(
+ self, test_case=test_case
+ )

  return self.score

@@ -112,6 +119,7 @@ class MCPUseMetric(BaseMetric):
  test_case: LLMTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  ) -> float:
  check_llm_test_case_params(test_case, self._required_params, self)

@@ -154,7 +162,10 @@ class MCPUseMetric(BaseMetric):
  self,
  steps=steps,
  )
-
+ if _log_metric_to_confident:
+ metric_data_manager.post_metric_if_enabled(
+ self, test_case=test_case
+ )
  return self.score

  def _get_primitives_used_score(
deepeval/metrics/misuse/misuse.py

@@ -16,6 +16,7 @@ from deepeval.metrics.utils import (
  )
  from deepeval.metrics.misuse.template import MisuseTemplate
  from deepeval.metrics.misuse.schema import *
+ from deepeval.metrics.api import metric_data_manager


  class MisuseMetric(BaseMetric):
@@ -53,6 +54,7 @@ class MisuseMetric(BaseMetric):
  test_case: LLMTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  ) -> float:

  check_llm_test_case_params(test_case, self._required_params, self)
@@ -68,6 +70,7 @@ class MisuseMetric(BaseMetric):
  test_case,
  _show_indicator=False,
  _in_component=_in_component,
+ _log_metric_to_confident=_log_metric_to_confident,
  )
  )
  else:
@@ -86,6 +89,10 @@ class MisuseMetric(BaseMetric):
  f"Score: {self.score}\nReason: {self.reason}",
  ],
  )
+ if _log_metric_to_confident:
+ metric_data_manager.post_metric_if_enabled(
+ self, test_case=test_case
+ )

  return self.score

@@ -94,6 +101,7 @@ class MisuseMetric(BaseMetric):
  test_case: LLMTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  ) -> float:

  check_llm_test_case_params(test_case, self._required_params, self)
@@ -122,7 +130,10 @@ class MisuseMetric(BaseMetric):
  f"Score: {self.score}\nReason: {self.reason}",
  ],
  )
-
+ if _log_metric_to_confident:
+ metric_data_manager.post_metric_if_enabled(
+ self, test_case=test_case
+ )
  return self.score

  async def _a_generate_reason(self) -> str:
deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py

@@ -48,6 +48,7 @@ class ImageCoherenceMetric(BaseMultimodalMetric):
  test_case: MLLMTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  ) -> float:
  check_mllm_test_case_params(
  test_case, self._required_params, None, None, self
@@ -63,6 +64,7 @@ class ImageCoherenceMetric(BaseMultimodalMetric):
  test_case,
  _show_indicator=False,
  _in_component=_in_component,
+ _log_metric_to_confident=_log_metric_to_confident,
  )
  )
  else:
@@ -146,6 +148,7 @@ class ImageCoherenceMetric(BaseMultimodalMetric):
  test_case: MLLMTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  ) -> float:
  check_mllm_test_case_params(
  test_case, self._required_params, None, None, self
deepeval/metrics/multimodal_metrics/image_editing/image_editing.py

@@ -47,6 +47,7 @@ class ImageEditingMetric(BaseMultimodalMetric):
  test_case: MLLMTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  ) -> float:
  check_mllm_test_case_params(
  test_case, self._required_params, 1, 1, self
@@ -63,6 +64,7 @@ class ImageEditingMetric(BaseMultimodalMetric):
  test_case,
  _show_indicator=False,
  _in_component=_in_component,
+ _log_metric_to_confident=_log_metric_to_confident,
  )
  )
  else:
@@ -108,6 +110,7 @@ class ImageEditingMetric(BaseMultimodalMetric):
  test_case: MLLMTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  ) -> float:
  check_mllm_test_case_params(
  test_case, self._required_params, 1, 1, self
deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py

@@ -49,6 +49,7 @@ class ImageHelpfulnessMetric(BaseMultimodalMetric):
  test_case: MLLMTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  ) -> float:
  check_mllm_test_case_params(
  test_case, self._required_params, None, None, self
@@ -64,6 +65,7 @@ class ImageHelpfulnessMetric(BaseMultimodalMetric):
  test_case,
  _show_indicator=False,
  _in_component=_in_component,
+ _log_metric_to_confident=_log_metric_to_confident,
  )
  )
  else:
@@ -147,6 +149,7 @@ class ImageHelpfulnessMetric(BaseMultimodalMetric):
  test_case: MLLMTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  ) -> float:
  check_mllm_test_case_params(
  test_case, self._required_params, None, None, self
deepeval/metrics/multimodal_metrics/image_reference/image_reference.py

@@ -49,6 +49,7 @@ class ImageReferenceMetric(BaseMultimodalMetric):
  test_case: MLLMTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  ) -> float:
  check_mllm_test_case_params(
  test_case, self._required_params, None, None, self
@@ -64,6 +65,7 @@ class ImageReferenceMetric(BaseMultimodalMetric):
  test_case,
  _show_indicator=False,
  _in_component=_in_component,
+ _log_metric_to_confident=_log_metric_to_confident,
  )
  )
  else:
@@ -147,6 +149,7 @@ class ImageReferenceMetric(BaseMultimodalMetric):
  test_case: MLLMTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  ) -> float:
  check_mllm_test_case_params(
  test_case, self._required_params, None, None, self