deepeval 3.6.6__py3-none-any.whl → 3.6.7__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
Files changed (92):
  1. deepeval/_version.py +1 -1
  2. deepeval/benchmarks/equity_med_qa/equity_med_qa.py +1 -0
  3. deepeval/cli/main.py +42 -0
  4. deepeval/confident/api.py +1 -0
  5. deepeval/config/settings.py +22 -4
  6. deepeval/constants.py +8 -1
  7. deepeval/dataset/dataset.py +2 -11
  8. deepeval/dataset/utils.py +1 -1
  9. deepeval/evaluate/evaluate.py +5 -1
  10. deepeval/evaluate/execute.py +97 -42
  11. deepeval/evaluate/utils.py +20 -116
  12. deepeval/integrations/crewai/__init__.py +6 -1
  13. deepeval/integrations/crewai/handler.py +1 -1
  14. deepeval/integrations/crewai/subs.py +51 -0
  15. deepeval/integrations/crewai/wrapper.py +45 -5
  16. deepeval/metrics/answer_relevancy/answer_relevancy.py +12 -3
  17. deepeval/metrics/api.py +281 -0
  18. deepeval/metrics/argument_correctness/argument_correctness.py +12 -2
  19. deepeval/metrics/bias/bias.py +12 -3
  20. deepeval/metrics/contextual_precision/contextual_precision.py +12 -3
  21. deepeval/metrics/contextual_recall/contextual_recall.py +12 -3
  22. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +12 -1
  23. deepeval/metrics/conversation_completeness/conversation_completeness.py +12 -0
  24. deepeval/metrics/conversational_dag/conversational_dag.py +12 -0
  25. deepeval/metrics/conversational_dag/nodes.py +12 -4
  26. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +73 -59
  27. deepeval/metrics/dag/dag.py +12 -0
  28. deepeval/metrics/dag/nodes.py +12 -4
  29. deepeval/metrics/faithfulness/faithfulness.py +12 -1
  30. deepeval/metrics/g_eval/g_eval.py +11 -0
  31. deepeval/metrics/hallucination/hallucination.py +12 -1
  32. deepeval/metrics/indicator.py +8 -2
  33. deepeval/metrics/json_correctness/json_correctness.py +12 -1
  34. deepeval/metrics/knowledge_retention/knowledge_retention.py +12 -0
  35. deepeval/metrics/mcp/mcp_task_completion.py +13 -0
  36. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +13 -0
  37. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +12 -1
  38. deepeval/metrics/misuse/misuse.py +12 -1
  39. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +3 -0
  40. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +3 -0
  41. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +3 -0
  42. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +3 -0
  43. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +6 -1
  44. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +6 -1
  45. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +3 -0
  46. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +3 -0
  47. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +3 -0
  48. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +3 -0
  49. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +10 -5
  50. deepeval/metrics/non_advice/non_advice.py +12 -0
  51. deepeval/metrics/pii_leakage/pii_leakage.py +12 -1
  52. deepeval/metrics/prompt_alignment/prompt_alignment.py +12 -1
  53. deepeval/metrics/role_adherence/role_adherence.py +12 -0
  54. deepeval/metrics/role_violation/role_violation.py +12 -0
  55. deepeval/metrics/summarization/summarization.py +12 -1
  56. deepeval/metrics/task_completion/task_completion.py +3 -0
  57. deepeval/metrics/tool_correctness/tool_correctness.py +8 -0
  58. deepeval/metrics/toxicity/toxicity.py +12 -0
  59. deepeval/metrics/turn_relevancy/turn_relevancy.py +12 -0
  60. deepeval/models/llms/grok_model.py +1 -1
  61. deepeval/models/llms/openai_model.py +2 -0
  62. deepeval/openai/__init__.py +14 -32
  63. deepeval/openai/extractors.py +24 -34
  64. deepeval/openai/patch.py +256 -161
  65. deepeval/openai/types.py +20 -0
  66. deepeval/openai/utils.py +98 -56
  67. deepeval/prompt/__init__.py +19 -1
  68. deepeval/prompt/api.py +160 -0
  69. deepeval/prompt/prompt.py +244 -62
  70. deepeval/prompt/utils.py +144 -2
  71. deepeval/synthesizer/chunking/context_generator.py +209 -152
  72. deepeval/synthesizer/chunking/doc_chunker.py +46 -12
  73. deepeval/synthesizer/synthesizer.py +8 -5
  74. deepeval/test_case/api.py +131 -0
  75. deepeval/test_run/__init__.py +1 -0
  76. deepeval/test_run/hyperparameters.py +47 -8
  77. deepeval/test_run/test_run.py +104 -1
  78. deepeval/tracing/api.py +3 -1
  79. deepeval/tracing/message_types/__init__.py +10 -0
  80. deepeval/tracing/message_types/base.py +6 -0
  81. deepeval/tracing/message_types/messages.py +14 -0
  82. deepeval/tracing/message_types/tools.py +18 -0
  83. deepeval/tracing/otel/utils.py +1 -1
  84. deepeval/tracing/trace_context.py +73 -4
  85. deepeval/tracing/tracing.py +51 -3
  86. deepeval/tracing/types.py +16 -0
  87. deepeval/tracing/utils.py +8 -0
  88. {deepeval-3.6.6.dist-info → deepeval-3.6.7.dist-info}/METADATA +1 -1
  89. {deepeval-3.6.6.dist-info → deepeval-3.6.7.dist-info}/RECORD +92 -84
  90. {deepeval-3.6.6.dist-info → deepeval-3.6.7.dist-info}/LICENSE.md +0 -0
  91. {deepeval-3.6.6.dist-info → deepeval-3.6.7.dist-info}/WHEEL +0 -0
  92. {deepeval-3.6.6.dist-info → deepeval-3.6.7.dist-info}/entry_points.txt +0 -0
deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py

@@ -53,6 +53,7 @@ class MultimodalFaithfulnessMetric(BaseMultimodalMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
             test_case, self._required_params, None, None, self
@@ -71,6 +72,7 @@ class MultimodalFaithfulnessMetric(BaseMultimodalMetric):
                     test_case,
                     _show_indicator=False,
                     _in_component=_in_component,
+                    _log_metric_to_confident=_log_metric_to_confident,
                 )
             )
         else:
@@ -97,6 +99,7 @@ class MultimodalFaithfulnessMetric(BaseMultimodalMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
             test_case, self._required_params, None, None, self

deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py

@@ -78,6 +78,7 @@ class MultimodalGEval(BaseMultimodalMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
         _additional_context: Optional[str] = None,
     ) -> float:

@@ -96,6 +97,7 @@ class MultimodalGEval(BaseMultimodalMetric):
                     test_case,
                     _show_indicator=False,
                     _in_component=_in_component,
+                    _log_metric_to_confident=_log_metric_to_confident,
                     _additional_context=_additional_context,
                 )
             )
@@ -132,6 +134,7 @@ class MultimodalGEval(BaseMultimodalMetric):
         _show_indicator: bool = True,
         _in_component: bool = False,
         _additional_context: Optional[str] = None,
+        _log_metric_to_confident: bool = True,
     ) -> float:

         check_mllm_test_case_params(

deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py

@@ -3,7 +3,7 @@ from typing import List, Dict
 from deepeval.metrics.indicator import metric_progress_indicator
 from deepeval.metrics.utils import (
     construct_verbose_logs,
-    check_llm_test_case_params,
+    check_mllm_test_case_params,
 )
 from deepeval.test_case import (
     MLLMTestCase,
@@ -11,10 +11,10 @@ from deepeval.test_case import (
     ToolCallParams,
     ToolCall,
 )
-from deepeval.metrics import BaseMetric
+from deepeval.metrics import BaseMultimodalMetric


-class MultimodalToolCorrectnessMetric(BaseMetric):
+class MultimodalToolCorrectnessMetric(BaseMultimodalMetric):

     _required_params: List[MLLMTestCaseParams] = [
         MLLMTestCaseParams.INPUT,
@@ -46,8 +46,11 @@ class MultimodalToolCorrectnessMetric(BaseMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
-        check_llm_test_case_params(test_case, self._required_params, self)
+        check_mllm_test_case_params(
+            test_case, self._required_params, None, None, self
+        )
         self.test_case = test_case
         with metric_progress_indicator(
             self, _show_indicator=_show_indicator, _in_component=_in_component
@@ -90,11 +93,13 @@ class MultimodalToolCorrectnessMetric(BaseMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         return self.measure(
             test_case,
             _show_indicator=_show_indicator,
             _in_component=_in_component,
+            _log_metric_to_confident=_log_metric_to_confident,
         )

     ##################################################
@@ -278,7 +283,7 @@ class MultimodalToolCorrectnessMetric(BaseMetric):

     @property
     def __name__(self):
-        return "Tool Correctness"
+        return "Multi Modal Tool Correctness"

     def indent_multiline_string(self, s, indent_level=4):
         indent = " " * indent_level

deepeval/metrics/non_advice/non_advice.py

@@ -17,6 +17,7 @@ from deepeval.metrics.utils import (
 )
 from deepeval.metrics.non_advice.template import NonAdviceTemplate
 from deepeval.metrics.non_advice.schema import *
+from deepeval.metrics.api import metric_data_manager


 class NonAdviceMetric(BaseMetric):
@@ -58,6 +59,7 @@ class NonAdviceMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:

         check_llm_test_case_params(test_case, self._required_params, self)
@@ -73,6 +75,7 @@ class NonAdviceMetric(BaseMetric):
                     test_case,
                     _show_indicator=False,
                     _in_component=_in_component,
+                    _log_metric_to_confident=_log_metric_to_confident,
                 )
             )
         else:
@@ -93,6 +96,10 @@ class NonAdviceMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )

             return self.score

@@ -101,6 +108,7 @@ class NonAdviceMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:

         check_llm_test_case_params(test_case, self._required_params, self)
@@ -129,6 +137,10 @@ class NonAdviceMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )

             return self.score

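The hunks above show the pattern this release applies to nearly every metric: measure() and a_measure() gain a private, default-on _log_metric_to_confident flag, forward it when delegating, and post the finished result through metric_data_manager.post_metric_if_enabled once verbose logs are built. The sketch below imitates that control flow in isolation; it is not deepeval's implementation, and the gate inside post_metric_if_enabled is an assumption (the real manager presumably checks whether Confident AI logging is configured):

import os


class MetricDataManager:
    """Stand-in for deepeval.metrics.api.metric_data_manager (sketch only)."""

    def post_metric_if_enabled(self, metric, test_case=None) -> None:
        # Hypothetical gate: skip the upload unless Confident AI is configured.
        if not os.getenv("CONFIDENT_API_KEY"):
            return
        print(f"posting '{metric.__name__}' (score={metric.score})")


metric_data_manager = MetricDataManager()


class ExampleMetric:
    """Mimics the measure() control flow added to each metric in 3.6.7."""

    def measure(self, test_case, _log_metric_to_confident: bool = True) -> float:
        self.score = 1.0  # stand-in for the real LLM-judged evaluation
        # Default-on upload, gated by the private flag.
        if _log_metric_to_confident:
            metric_data_manager.post_metric_if_enabled(self, test_case=test_case)
        return self.score

    @property
    def __name__(self):
        return "Example Metric"


ExampleMetric().measure(test_case=None)  # posts only if the env gate passes
ExampleMetric().measure(test_case=None, _log_metric_to_confident=False)  # never posts

Passing _log_metric_to_confident=False lets internal callers, for example a composite metric or a test run that reports results itself, suppress the per-metric upload.
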
deepeval/metrics/pii_leakage/pii_leakage.py

@@ -17,6 +17,7 @@ from deepeval.metrics.utils import (
 )
 from deepeval.metrics.pii_leakage.template import PIILeakageTemplate
 from deepeval.metrics.pii_leakage.schema import *
+from deepeval.metrics.api import metric_data_manager


 class PIILeakageMetric(BaseMetric):
@@ -49,6 +50,7 @@ class PIILeakageMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:

         check_llm_test_case_params(test_case, self._required_params, self)
@@ -64,6 +66,7 @@ class PIILeakageMetric(BaseMetric):
                     test_case,
                     _show_indicator=False,
                     _in_component=_in_component,
+                    _log_metric_to_confident=_log_metric_to_confident,
                 )
             )
         else:
@@ -84,6 +87,10 @@ class PIILeakageMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )

             return self.score

@@ -92,6 +99,7 @@ class PIILeakageMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:

         check_llm_test_case_params(test_case, self._required_params, self)
@@ -120,7 +128,10 @@ class PIILeakageMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
-
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score

     async def _a_generate_reason(self) -> str:

deepeval/metrics/prompt_alignment/prompt_alignment.py

@@ -20,6 +20,8 @@ from deepeval.metrics.indicator import metric_progress_indicator
 from deepeval.metrics.prompt_alignment import schema as paschema
 from deepeval.config.settings import get_settings

+from deepeval.metrics.api import metric_data_manager
+

 class PromptAlignmentMetric(BaseMetric):

@@ -55,6 +57,7 @@ class PromptAlignmentMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:

         check_llm_test_case_params(test_case, self._required_params, self)
@@ -93,6 +96,10 @@ class PromptAlignmentMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )

             return self.score

@@ -101,6 +108,7 @@ class PromptAlignmentMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:

         check_llm_test_case_params(test_case, self._required_params, self)
@@ -128,7 +136,10 @@ class PromptAlignmentMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
-
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score

     async def _a_generate_reason(self, input: str, actual_output: str) -> str:

deepeval/metrics/role_adherence/role_adherence.py

@@ -1,6 +1,7 @@
 from typing import Optional, Union, List

 from deepeval.metrics import BaseConversationalMetric
+from deepeval.metrics.api import metric_data_manager
 from deepeval.metrics.role_adherence.schema import (
     OutOfCharacterResponseVerdicts,
 )
@@ -44,6 +45,7 @@ class RoleAdherenceMetric(BaseConversationalMetric):
         test_case: ConversationalTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ):
         check_conversational_test_case_params(
             test_case,
@@ -63,6 +65,7 @@ class RoleAdherenceMetric(BaseConversationalMetric):
                     test_case,
                     _show_indicator=False,
                     _in_component=_in_component,
+                    _log_metric_to_confident=_log_metric_to_confident,
                 )
             )
         else:
@@ -82,6 +85,10 @@ class RoleAdherenceMetric(BaseConversationalMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score

     async def a_measure(
@@ -89,6 +96,7 @@ class RoleAdherenceMetric(BaseConversationalMetric):
         test_case: ConversationalTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_conversational_test_case_params(
             test_case,
@@ -124,6 +132,10 @@ class RoleAdherenceMetric(BaseConversationalMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score

     async def _a_generate_reason(self, role: str) -> str:

deepeval/metrics/role_violation/role_violation.py

@@ -17,6 +17,7 @@ from deepeval.metrics.utils import (
 )
 from deepeval.metrics.role_violation.template import RoleViolationTemplate
 from deepeval.metrics.role_violation.schema import *
+from deepeval.metrics.api import metric_data_manager


 class RoleViolationMetric(BaseMetric):
@@ -58,6 +59,7 @@ class RoleViolationMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:

         check_llm_test_case_params(test_case, self._required_params, self)
@@ -73,6 +75,7 @@ class RoleViolationMetric(BaseMetric):
                     test_case,
                     _show_indicator=False,
                     _in_component=_in_component,
+                    _log_metric_to_confident=_log_metric_to_confident,
                 )
             )
         else:
@@ -94,6 +97,10 @@ class RoleViolationMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )

             return self.score

@@ -102,6 +109,7 @@ class RoleViolationMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:

         check_llm_test_case_params(test_case, self._required_params, self)
@@ -131,6 +139,10 @@ class RoleViolationMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )

             return self.score

deepeval/metrics/summarization/summarization.py

@@ -1,6 +1,7 @@
 from typing import List, Optional, Union
 import asyncio

+from deepeval.metrics.api import metric_data_manager
 from deepeval.test_case import (
     LLMTestCase,
     LLMTestCaseParams,
@@ -73,6 +74,7 @@ class SummarizationMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:

         check_llm_test_case_params(test_case, self._required_params, self)
@@ -88,6 +90,7 @@ class SummarizationMetric(BaseMetric):
                     test_case,
                     _show_indicator=False,
                     _in_component=_in_component,
+                    _log_metric_to_confident=_log_metric_to_confident,
                 )
             )
         else:
@@ -121,7 +124,10 @@ class SummarizationMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
-
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score

     async def a_measure(
@@ -129,6 +135,7 @@ class SummarizationMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:

         check_llm_test_case_params(test_case, self._required_params, self)
@@ -171,6 +178,10 @@ class SummarizationMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )

             return self.score

deepeval/metrics/task_completion/task_completion.py

@@ -50,6 +50,7 @@ class TaskCompletionMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         has_trace: bool = isinstance(test_case._trace_dict, Dict)
         if not has_trace:
@@ -66,6 +67,7 @@ class TaskCompletionMetric(BaseMetric):
                     test_case,
                     _show_indicator=False,
                     _in_component=_in_component,
+                    _log_metric_to_confident=_log_metric_to_confident,
                 )
             )
         else:
@@ -89,6 +91,7 @@ class TaskCompletionMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         has_trace: bool = isinstance(test_case._trace_dict, Dict)
         if not has_trace:

deepeval/metrics/tool_correctness/tool_correctness.py

@@ -12,6 +12,7 @@ from deepeval.test_case import (
     ToolCall,
 )
 from deepeval.metrics import BaseMetric
+from deepeval.metrics.api import metric_data_manager


 class ToolCorrectnessMetric(BaseMetric):
@@ -45,6 +46,7 @@ class ToolCorrectnessMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:

         check_llm_test_case_params(test_case, self._required_params, self)
@@ -83,6 +85,11 @@ class ToolCorrectnessMetric(BaseMetric):
             ]
             steps.append(f"Score: {self.score}\nReason: {self.reason}")
             self.verbose_logs = construct_verbose_logs(self, steps=steps)
+
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score

     async def a_measure(
@@ -90,6 +97,7 @@ class ToolCorrectnessMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         return self.measure(
             test_case,

deepeval/metrics/toxicity/toxicity.py

@@ -17,6 +17,7 @@ from deepeval.metrics.utils import (
 )
 from deepeval.metrics.toxicity.template import ToxicityTemplate
 from deepeval.metrics.toxicity.schema import *
+from deepeval.metrics.api import metric_data_manager


 class ToxicityMetric(BaseMetric):
@@ -50,6 +51,7 @@ class ToxicityMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:

         check_llm_test_case_params(test_case, self._required_params, self)
@@ -65,6 +67,7 @@ class ToxicityMetric(BaseMetric):
                     test_case,
                     _show_indicator=False,
                     _in_component=_in_component,
+                    _log_metric_to_confident=_log_metric_to_confident,
                 )
             )
         else:
@@ -84,6 +87,10 @@ class ToxicityMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )

             return self.score

@@ -92,6 +99,7 @@ class ToxicityMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:

         check_llm_test_case_params(test_case, self._required_params, self)
@@ -122,6 +130,10 @@ class ToxicityMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )

             return self.score

deepeval/metrics/turn_relevancy/turn_relevancy.py

@@ -20,6 +20,7 @@ from deepeval.metrics.indicator import metric_progress_indicator
 from deepeval.test_case import ConversationalTestCase, Turn, TurnParams
 from deepeval.utils import get_or_create_event_loop, prettify_list
 from deepeval.metrics.turn_relevancy.schema import *
+from deepeval.metrics.api import metric_data_manager


 class TurnRelevancyMetric(BaseConversationalMetric):
@@ -49,6 +50,7 @@ class TurnRelevancyMetric(BaseConversationalMetric):
         test_case: ConversationalTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ):
         check_conversational_test_case_params(
             test_case, self._required_test_case_params, self
@@ -65,6 +67,7 @@ class TurnRelevancyMetric(BaseConversationalMetric):
                     test_case,
                     _show_indicator=False,
                     _in_component=_in_component,
+                    _log_metric_to_confident=_log_metric_to_confident,
                 )
             )
         else:
@@ -91,6 +94,10 @@ class TurnRelevancyMetric(BaseConversationalMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score

     async def a_measure(
@@ -98,6 +105,7 @@ class TurnRelevancyMetric(BaseConversationalMetric):
         test_case: ConversationalTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_conversational_test_case_params(
             test_case, self._required_test_case_params, self
@@ -134,6 +142,10 @@ class TurnRelevancyMetric(BaseConversationalMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score

     async def _a_generate_reason(self) -> str:

deepeval/models/llms/grok_model.py

@@ -56,8 +56,8 @@ model_pricing = {
 class GrokModel(DeepEvalBaseLLM):
     def __init__(
         self,
-        api_key: Optional[str] = None,
         model: Optional[str] = None,
+        api_key: Optional[str] = None,
         temperature: float = 0,
         generation_kwargs: Optional[Dict] = None,
         **kwargs,

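The swap makes model the first parameter after self, which only matters for callers that pass arguments positionally. A hedged sketch of the difference (the model name and API key are illustrative):

from deepeval.models.llms.grok_model import GrokModel

# Keyword arguments are unaffected by the reorder and remain the safe style:
judge = GrokModel(model="grok-3", api_key="xai-...")

# A 3.6.6-era positional call, GrokModel("xai-...", "grok-3"), would now bind
# the API key to `model`, so positional callers must swap their arguments:
judge = GrokModel("grok-3", "xai-...")
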
deepeval/models/llms/openai_model.py

@@ -70,6 +70,8 @@ unsupported_log_probs_gpt_models = [
     "o1-mini-2024-09-12",
     "o3-mini",
     "o3-mini-2025-01-31",
+    "o4-mini",
+    "o4-mini-2025-04-16",
     "gpt-4.5-preview-2025-02-27",
     "gpt-5",
     "gpt-5-2025-08-07",

deepeval/openai/__init__.py

@@ -1,37 +1,19 @@
-from importlib.machinery import SourceFileLoader
-import importlib.util
-import sys
-
-from deepeval.openai.patch import patch_openai
+try:
+    import openai  # noqa: F401
+except ImportError:
+    raise ModuleNotFoundError(
+        "Please install OpenAI to use this feature: 'pip install openai'"
+    )


-def load_and_patch_openai():
-    openai_spec = importlib.util.find_spec("openai")
-    if not openai_spec or not openai_spec.origin:
-        raise ImportError("Could not find the OpenAI package")
-    package_dirs = openai_spec.submodule_search_locations
-    loader = SourceFileLoader("deepeval_openai", openai_spec.origin)
-    new_spec = importlib.util.spec_from_loader(
-        "deepeval_openai",
-        loader,
-        origin=openai_spec.origin,
-        is_package=True,
-    )
-    deepeval_openai = importlib.util.module_from_spec(new_spec)
-    deepeval_openai.__path__ = package_dirs
-    sys.modules["deepeval_openai"] = deepeval_openai
-    loader.exec_module(deepeval_openai)
-    patch_openai(deepeval_openai)
-    return deepeval_openai
+try:
+    from openai import OpenAI, AsyncOpenAI  # noqa: F401
+except ImportError:
+    OpenAI = None  # type: ignore
+    AsyncOpenAI = None  # type: ignore


-patched_openai = load_and_patch_openai()
-openai = patched_openai
-OpenAI = patched_openai.OpenAI
-AsyncOpenAI = patched_openai.AsyncOpenAI
+if OpenAI or AsyncOpenAI:
+    from deepeval.openai.patch import patch_openai_classes

-__all__ = [
-    "openai",
-    "OpenAI",
-    "AsyncOpenAI",
-]
+    patch_openai_classes()
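
The rewrite drops 3.6.6's approach of loading a private copy of the openai package under the name deepeval_openai and patching the copy; 3.6.7 imports openai normally and, when the client classes are present, patches OpenAI/AsyncOpenAI in place. Assuming patch_openai_classes mutates those classes rather than replacing the module (the function name suggests as much), a usage sketch:

# Importing deepeval.openai now patches the installed client classes in place,
# so clients built from either import path carry deepeval's instrumentation.
import deepeval.openai  # side effect: runs patch_openai_classes() if openai is installed
from openai import OpenAI

client = OpenAI(api_key="sk-...")  # illustrative key; the class is already patched

One consequence of the new design: there is no separate deepeval_openai module anymore, and deepeval itself imports cleanly without openai installed, raising ModuleNotFoundError only when this subpackage is imported.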