deepeval 3.6.5__py3-none-any.whl → 3.6.7__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only.
Files changed (97)
  1. deepeval/__init__.py +42 -10
  2. deepeval/_version.py +1 -1
  3. deepeval/benchmarks/equity_med_qa/equity_med_qa.py +1 -0
  4. deepeval/cli/main.py +42 -0
  5. deepeval/confident/api.py +1 -0
  6. deepeval/config/logging.py +33 -0
  7. deepeval/config/settings.py +176 -16
  8. deepeval/constants.py +8 -1
  9. deepeval/dataset/dataset.py +2 -11
  10. deepeval/dataset/utils.py +1 -1
  11. deepeval/evaluate/evaluate.py +5 -1
  12. deepeval/evaluate/execute.py +118 -60
  13. deepeval/evaluate/utils.py +20 -116
  14. deepeval/integrations/crewai/__init__.py +6 -1
  15. deepeval/integrations/crewai/handler.py +1 -1
  16. deepeval/integrations/crewai/subs.py +51 -0
  17. deepeval/integrations/crewai/wrapper.py +45 -5
  18. deepeval/metrics/answer_relevancy/answer_relevancy.py +12 -3
  19. deepeval/metrics/api.py +281 -0
  20. deepeval/metrics/argument_correctness/argument_correctness.py +12 -2
  21. deepeval/metrics/bias/bias.py +12 -3
  22. deepeval/metrics/contextual_precision/contextual_precision.py +12 -3
  23. deepeval/metrics/contextual_recall/contextual_recall.py +12 -3
  24. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +12 -1
  25. deepeval/metrics/conversation_completeness/conversation_completeness.py +12 -0
  26. deepeval/metrics/conversational_dag/conversational_dag.py +12 -0
  27. deepeval/metrics/conversational_dag/nodes.py +12 -4
  28. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +73 -59
  29. deepeval/metrics/dag/dag.py +12 -0
  30. deepeval/metrics/dag/nodes.py +12 -4
  31. deepeval/metrics/faithfulness/faithfulness.py +12 -1
  32. deepeval/metrics/g_eval/g_eval.py +37 -15
  33. deepeval/metrics/hallucination/hallucination.py +12 -1
  34. deepeval/metrics/indicator.py +8 -2
  35. deepeval/metrics/json_correctness/json_correctness.py +12 -1
  36. deepeval/metrics/knowledge_retention/knowledge_retention.py +12 -0
  37. deepeval/metrics/mcp/mcp_task_completion.py +13 -0
  38. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +13 -0
  39. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +12 -1
  40. deepeval/metrics/misuse/misuse.py +12 -1
  41. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +3 -0
  42. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +3 -0
  43. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +3 -0
  44. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +3 -0
  45. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +6 -1
  46. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +6 -1
  47. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +3 -0
  48. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +3 -0
  49. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +3 -0
  50. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +3 -0
  51. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +10 -5
  52. deepeval/metrics/non_advice/non_advice.py +12 -0
  53. deepeval/metrics/pii_leakage/pii_leakage.py +12 -1
  54. deepeval/metrics/prompt_alignment/prompt_alignment.py +53 -24
  55. deepeval/metrics/role_adherence/role_adherence.py +12 -0
  56. deepeval/metrics/role_violation/role_violation.py +12 -0
  57. deepeval/metrics/summarization/summarization.py +12 -1
  58. deepeval/metrics/task_completion/task_completion.py +3 -0
  59. deepeval/metrics/tool_correctness/tool_correctness.py +8 -0
  60. deepeval/metrics/toxicity/toxicity.py +12 -0
  61. deepeval/metrics/turn_relevancy/turn_relevancy.py +12 -0
  62. deepeval/models/llms/grok_model.py +1 -1
  63. deepeval/models/llms/openai_model.py +2 -0
  64. deepeval/models/retry_policy.py +202 -11
  65. deepeval/openai/__init__.py +14 -32
  66. deepeval/openai/extractors.py +24 -34
  67. deepeval/openai/patch.py +256 -161
  68. deepeval/openai/types.py +20 -0
  69. deepeval/openai/utils.py +98 -56
  70. deepeval/prompt/__init__.py +19 -1
  71. deepeval/prompt/api.py +160 -0
  72. deepeval/prompt/prompt.py +244 -62
  73. deepeval/prompt/utils.py +144 -2
  74. deepeval/synthesizer/chunking/context_generator.py +209 -152
  75. deepeval/synthesizer/chunking/doc_chunker.py +46 -12
  76. deepeval/synthesizer/synthesizer.py +8 -5
  77. deepeval/test_case/api.py +131 -0
  78. deepeval/test_run/__init__.py +1 -0
  79. deepeval/test_run/hyperparameters.py +47 -8
  80. deepeval/test_run/test_run.py +104 -1
  81. deepeval/tracing/api.py +3 -1
  82. deepeval/tracing/message_types/__init__.py +10 -0
  83. deepeval/tracing/message_types/base.py +6 -0
  84. deepeval/tracing/message_types/messages.py +14 -0
  85. deepeval/tracing/message_types/tools.py +18 -0
  86. deepeval/tracing/otel/exporter.py +0 -6
  87. deepeval/tracing/otel/utils.py +58 -8
  88. deepeval/tracing/trace_context.py +73 -4
  89. deepeval/tracing/trace_test_manager.py +19 -0
  90. deepeval/tracing/tracing.py +52 -4
  91. deepeval/tracing/types.py +16 -0
  92. deepeval/tracing/utils.py +8 -0
  93. {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/METADATA +1 -1
  94. {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/RECORD +97 -87
  95. {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/LICENSE.md +0 -0
  96. {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/WHEEL +0 -0
  97. {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/entry_points.txt +0 -0
deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py
@@ -46,13 +46,16 @@ class MultimodalAnswerRelevancyMetric(BaseMultimodalMetric):
  test_case: MLLMTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  ) -> float:
  check_mllm_test_case_params(
  test_case, self._required_params, None, None, self
  )
  self.evaluation_cost = 0 if self.using_native_model else None
  with metric_progress_indicator(
- self, _show_indicator=_show_indicator, _in_component=_in_component
+ self,
+ _show_indicator=_show_indicator,
+ _in_component=_in_component,
  ):
  if self.async_mode:
  loop = get_or_create_event_loop()
@@ -61,6 +64,7 @@ class MultimodalAnswerRelevancyMetric(BaseMultimodalMetric):
  test_case,
  _show_indicator=False,
  _in_component=_in_component,
+ _log_metric_to_confident=_log_metric_to_confident,
  )
  )
  else:
@@ -89,6 +93,7 @@ class MultimodalAnswerRelevancyMetric(BaseMultimodalMetric):
  test_case: MLLMTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  ) -> float:
  check_mllm_test_case_params(
  test_case, self._required_params, None, None, self
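
The hunks above, and the parallel hunks for the other multimodal metrics that follow, all thread a new _log_metric_to_confident flag from the synchronous measure entry point into a_measure. A minimal, self-contained sketch of that pattern is shown below; ToyMetric and the module-level post_metric_if_enabled are illustrative stand-ins, not deepeval's actual classes.

import asyncio


def post_metric_if_enabled(metric, test_case) -> None:
    # Stand-in for deepeval.metrics.api.metric_data_manager.post_metric_if_enabled;
    # the real manager decides whether posting to Confident AI is enabled.
    print(f"would post {type(metric).__name__} score={metric.score}")


class ToyMetric:
    # Hypothetical metric; only the flag-threading pattern mirrors the diff.
    def __init__(self) -> None:
        self.score = None

    def measure(self, test_case, _log_metric_to_confident: bool = True) -> float:
        # The sync entry point forwards the new flag into the async path.
        return asyncio.run(
            self.a_measure(
                test_case, _log_metric_to_confident=_log_metric_to_confident
            )
        )

    async def a_measure(
        self, test_case, _log_metric_to_confident: bool = True
    ) -> float:
        self.score = 1.0  # pretend the evaluation ran
        if _log_metric_to_confident:
            post_metric_if_enabled(self, test_case=test_case)
        return self.score


if __name__ == "__main__":
    ToyMetric().measure({"input": "hi"})                                  # posts
    ToyMetric().measure({"input": "hi"}, _log_metric_to_confident=False)  # silent
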
deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py
@@ -49,6 +49,7 @@ class MultimodalContextualPrecisionMetric(BaseMultimodalMetric):
  test_case: MLLMTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  ) -> float:
  check_mllm_test_case_params(
  test_case, self._required_params, None, None, self
@@ -56,7 +57,9 @@ class MultimodalContextualPrecisionMetric(BaseMultimodalMetric):

  self.evaluation_cost = 0 if self.using_native_model else None
  with metric_progress_indicator(
- self, _show_indicator=_show_indicator, _in_component=_in_component
+ self,
+ _show_indicator=_show_indicator,
+ _in_component=_in_component,
  ):
  if self.async_mode:
  loop = get_or_create_event_loop()
@@ -65,6 +68,7 @@ class MultimodalContextualPrecisionMetric(BaseMultimodalMetric):
  test_case,
  _show_indicator=False,
  _in_component=_in_component,
+ _log_metric_to_confident=_log_metric_to_confident,
  )
  )
  else:
@@ -93,6 +97,7 @@ class MultimodalContextualPrecisionMetric(BaseMultimodalMetric):
  test_case: MLLMTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  ) -> float:
  check_mllm_test_case_params(
  test_case, self._required_params, None, None, self
deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py
@@ -48,6 +48,7 @@ class MultimodalContextualRecallMetric(BaseMultimodalMetric):
  test_case: MLLMTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  ) -> float:
  check_mllm_test_case_params(
  test_case, self._required_params, None, None, self
@@ -64,6 +65,7 @@ class MultimodalContextualRecallMetric(BaseMultimodalMetric):
  test_case,
  _show_indicator=False,
  _in_component=_in_component,
+ _log_metric_to_confident=_log_metric_to_confident,
  )
  )
  else:
@@ -90,6 +92,7 @@ class MultimodalContextualRecallMetric(BaseMultimodalMetric):
  test_case: MLLMTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  ) -> float:
  check_mllm_test_case_params(
  test_case, self._required_params, None, None, self
deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py
@@ -49,6 +49,7 @@ class MultimodalContextualRelevancyMetric(BaseMultimodalMetric):
  test_case: MLLMTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  ) -> float:
  check_mllm_test_case_params(
  test_case, self._required_params, None, None, self
@@ -65,6 +66,7 @@ class MultimodalContextualRelevancyMetric(BaseMultimodalMetric):
  test_case,
  _show_indicator=False,
  _in_component=_in_component,
+ _log_metric_to_confident=_log_metric_to_confident,
  )
  )
  else:
@@ -90,6 +92,7 @@ class MultimodalContextualRelevancyMetric(BaseMultimodalMetric):
  test_case: MLLMTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  ) -> float:
  check_mllm_test_case_params(
  test_case, self._required_params, None, None, self
deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py
@@ -53,6 +53,7 @@ class MultimodalFaithfulnessMetric(BaseMultimodalMetric):
  test_case: MLLMTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  ) -> float:
  check_mllm_test_case_params(
  test_case, self._required_params, None, None, self
@@ -71,6 +72,7 @@ class MultimodalFaithfulnessMetric(BaseMultimodalMetric):
  test_case,
  _show_indicator=False,
  _in_component=_in_component,
+ _log_metric_to_confident=_log_metric_to_confident,
  )
  )
  else:
@@ -97,6 +99,7 @@ class MultimodalFaithfulnessMetric(BaseMultimodalMetric):
  test_case: MLLMTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  ) -> float:
  check_mllm_test_case_params(
  test_case, self._required_params, None, None, self
deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py
@@ -78,6 +78,7 @@ class MultimodalGEval(BaseMultimodalMetric):
  test_case: MLLMTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  _additional_context: Optional[str] = None,
  ) -> float:

@@ -96,6 +97,7 @@ class MultimodalGEval(BaseMultimodalMetric):
  test_case,
  _show_indicator=False,
  _in_component=_in_component,
+ _log_metric_to_confident=_log_metric_to_confident,
  _additional_context=_additional_context,
  )
  )
@@ -132,6 +134,7 @@ class MultimodalGEval(BaseMultimodalMetric):
  _show_indicator: bool = True,
  _in_component: bool = False,
  _additional_context: Optional[str] = None,
+ _log_metric_to_confident: bool = True,
  ) -> float:

  check_mllm_test_case_params(
deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py
@@ -3,7 +3,7 @@ from typing import List, Dict
  from deepeval.metrics.indicator import metric_progress_indicator
  from deepeval.metrics.utils import (
  construct_verbose_logs,
- check_llm_test_case_params,
+ check_mllm_test_case_params,
  )
  from deepeval.test_case import (
  MLLMTestCase,
@@ -11,10 +11,10 @@ from deepeval.test_case import (
  ToolCallParams,
  ToolCall,
  )
- from deepeval.metrics import BaseMetric
+ from deepeval.metrics import BaseMultimodalMetric


- class MultimodalToolCorrectnessMetric(BaseMetric):
+ class MultimodalToolCorrectnessMetric(BaseMultimodalMetric):

  _required_params: List[MLLMTestCaseParams] = [
  MLLMTestCaseParams.INPUT,
@@ -46,8 +46,11 @@ class MultimodalToolCorrectnessMetric(BaseMetric):
  test_case: MLLMTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  ) -> float:
- check_llm_test_case_params(test_case, self._required_params, self)
+ check_mllm_test_case_params(
+ test_case, self._required_params, None, None, self
+ )
  self.test_case = test_case
  with metric_progress_indicator(
  self, _show_indicator=_show_indicator, _in_component=_in_component
@@ -90,11 +93,13 @@ class MultimodalToolCorrectnessMetric(BaseMetric):
  test_case: MLLMTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  ) -> float:
  return self.measure(
  test_case,
  _show_indicator=_show_indicator,
  _in_component=_in_component,
+ _log_metric_to_confident=_log_metric_to_confident,
  )

  ##################################################
@@ -278,7 +283,7 @@ class MultimodalToolCorrectnessMetric(BaseMetric):

  @property
  def __name__(self):
- return "Tool Correctness"
+ return "Multi Modal Tool Correctness"

  def indent_multiline_string(self, s, indent_level=4):
  indent = " " * indent_level
deepeval/metrics/non_advice/non_advice.py
@@ -17,6 +17,7 @@ from deepeval.metrics.utils import (
  )
  from deepeval.metrics.non_advice.template import NonAdviceTemplate
  from deepeval.metrics.non_advice.schema import *
+ from deepeval.metrics.api import metric_data_manager


  class NonAdviceMetric(BaseMetric):
@@ -58,6 +59,7 @@ class NonAdviceMetric(BaseMetric):
  test_case: LLMTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  ) -> float:

  check_llm_test_case_params(test_case, self._required_params, self)
@@ -73,6 +75,7 @@ class NonAdviceMetric(BaseMetric):
  test_case,
  _show_indicator=False,
  _in_component=_in_component,
+ _log_metric_to_confident=_log_metric_to_confident,
  )
  )
  else:
@@ -93,6 +96,10 @@ class NonAdviceMetric(BaseMetric):
  f"Score: {self.score}\nReason: {self.reason}",
  ],
  )
+ if _log_metric_to_confident:
+ metric_data_manager.post_metric_if_enabled(
+ self, test_case=test_case
+ )

  return self.score

@@ -101,6 +108,7 @@ class NonAdviceMetric(BaseMetric):
  test_case: LLMTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  ) -> float:

  check_llm_test_case_params(test_case, self._required_params, self)
@@ -129,6 +137,10 @@ class NonAdviceMetric(BaseMetric):
  f"Score: {self.score}\nReason: {self.reason}",
  ],
  )
+ if _log_metric_to_confident:
+ metric_data_manager.post_metric_if_enabled(
+ self, test_case=test_case
+ )

  return self.score
deepeval/metrics/pii_leakage/pii_leakage.py
@@ -17,6 +17,7 @@ from deepeval.metrics.utils import (
  )
  from deepeval.metrics.pii_leakage.template import PIILeakageTemplate
  from deepeval.metrics.pii_leakage.schema import *
+ from deepeval.metrics.api import metric_data_manager


  class PIILeakageMetric(BaseMetric):
@@ -49,6 +50,7 @@ class PIILeakageMetric(BaseMetric):
  test_case: LLMTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  ) -> float:

  check_llm_test_case_params(test_case, self._required_params, self)
@@ -64,6 +66,7 @@ class PIILeakageMetric(BaseMetric):
  test_case,
  _show_indicator=False,
  _in_component=_in_component,
+ _log_metric_to_confident=_log_metric_to_confident,
  )
  )
  else:
@@ -84,6 +87,10 @@ class PIILeakageMetric(BaseMetric):
  f"Score: {self.score}\nReason: {self.reason}",
  ],
  )
+ if _log_metric_to_confident:
+ metric_data_manager.post_metric_if_enabled(
+ self, test_case=test_case
+ )

  return self.score

@@ -92,6 +99,7 @@ class PIILeakageMetric(BaseMetric):
  test_case: LLMTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  ) -> float:

  check_llm_test_case_params(test_case, self._required_params, self)
@@ -120,7 +128,10 @@ class PIILeakageMetric(BaseMetric):
  f"Score: {self.score}\nReason: {self.reason}",
  ],
  )
-
+ if _log_metric_to_confident:
+ metric_data_manager.post_metric_if_enabled(
+ self, test_case=test_case
+ )
  return self.score

  async def _a_generate_reason(self) -> str:
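
Judging from the updated signatures above, callers of these single-turn metrics should be able to suppress the new Confident metric logging on a per-call basis. A hedged usage sketch follows; it assumes an evaluation model is configured (e.g. via OPENAI_API_KEY) and that PIILeakageMetric's constructor defaults suffice, and the leading-underscore keyword is internal API that may change.

from deepeval.metrics.pii_leakage.pii_leakage import PIILeakageMetric
from deepeval.test_case import LLMTestCase

metric = PIILeakageMetric()
test_case = LLMTestCase(
    input="What is my coworker's home address?",
    actual_output="I can't share personal addresses.",
)
# Skip posting this metric run to Confident AI for this one call.
score = metric.measure(test_case, _log_metric_to_confident=False)
print(score, metric.reason)
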
deepeval/metrics/prompt_alignment/prompt_alignment.py
@@ -1,3 +1,5 @@
+ import asyncio
+
  from typing import Optional, List, Union

  from deepeval.utils import get_or_create_event_loop, prettify_list
@@ -15,7 +17,10 @@ from deepeval.metrics import BaseMetric
  from deepeval.models import DeepEvalBaseLLM
  from deepeval.metrics.prompt_alignment.template import PromptAlignmentTemplate
  from deepeval.metrics.indicator import metric_progress_indicator
- from deepeval.metrics.prompt_alignment.schema import *
+ from deepeval.metrics.prompt_alignment import schema as paschema
+ from deepeval.config.settings import get_settings
+
+ from deepeval.metrics.api import metric_data_manager


  class PromptAlignmentMetric(BaseMetric):
@@ -52,6 +57,7 @@ class PromptAlignmentMetric(BaseMetric):
  test_case: LLMTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  ) -> float:

  check_llm_test_case_params(test_case, self._required_params, self)
@@ -62,15 +68,19 @@ class PromptAlignmentMetric(BaseMetric):
  ):
  if self.async_mode:
  loop = get_or_create_event_loop()
+ coro = self.a_measure(
+ test_case,
+ _show_indicator=False,
+ _in_component=_in_component,
+ )
  loop.run_until_complete(
- self.a_measure(
- test_case,
- _show_indicator=False,
- _in_component=_in_component,
+ asyncio.wait_for(
+ coro,
+ timeout=get_settings().DEEPEVAL_PER_TASK_TIMEOUT_SECONDS,
  )
  )
  else:
- self.verdicts: Verdicts = self._generate_verdicts(
+ self.verdicts: paschema.Verdicts = self._generate_verdicts(
  test_case.input, test_case.actual_output
  )
  self.score = self._calculate_score()
@@ -86,6 +96,10 @@ class PromptAlignmentMetric(BaseMetric):
  f"Score: {self.score}\nReason: {self.reason}",
  ],
  )
+ if _log_metric_to_confident:
+ metric_data_manager.post_metric_if_enabled(
+ self, test_case=test_case
+ )

  return self.score

@@ -94,6 +108,7 @@ class PromptAlignmentMetric(BaseMetric):
  test_case: LLMTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  ) -> float:

  check_llm_test_case_params(test_case, self._required_params, self)
@@ -105,7 +120,7 @@ class PromptAlignmentMetric(BaseMetric):
  _show_indicator=_show_indicator,
  _in_component=_in_component,
  ):
- self.verdicts: Verdicts = await self._a_generate_verdicts(
+ self.verdicts: paschema.Verdicts = await self._a_generate_verdicts(
  test_case.input, test_case.actual_output
  )
  self.score = self._calculate_score()
@@ -121,7 +136,10 @@ class PromptAlignmentMetric(BaseMetric):
  f"Score: {self.score}\nReason: {self.reason}",
  ],
  )
-
+ if _log_metric_to_confident:
+ metric_data_manager.post_metric_if_enabled(
+ self, test_case=test_case
+ )
  return self.score

  async def _a_generate_reason(self, input: str, actual_output: str) -> str:
@@ -141,14 +159,17 @@ class PromptAlignmentMetric(BaseMetric):
  )
  if self.using_native_model:
  res, cost = await self.model.a_generate(
- prompt, schema=PromptAlignmentScoreReason
+ prompt, schema=paschema.PromptAlignmentScoreReason
  )
  self.evaluation_cost += cost
  return res.reason
  else:
  try:
- res: PromptAlignmentScoreReason = await self.model.a_generate(
- prompt=prompt, schema=PromptAlignmentScoreReason
+ res: paschema.PromptAlignmentScoreReason = (
+ await self.model.a_generate(
+ prompt=prompt,
+ schema=paschema.PromptAlignmentScoreReason,
+ )
  )
  return res.reason
  except TypeError:
@@ -173,14 +194,14 @@ class PromptAlignmentMetric(BaseMetric):
  )
  if self.using_native_model:
  res, cost = self.model.generate(
- prompt, schema=PromptAlignmentScoreReason
+ prompt, schema=paschema.PromptAlignmentScoreReason
  )
  self.evaluation_cost += cost
  return res.reason
  else:
  try:
- res: PromptAlignmentScoreReason = self.model.generate(
- prompt=prompt, schema=PromptAlignmentScoreReason
+ res: paschema.PromptAlignmentScoreReason = self.model.generate(
+ prompt=prompt, schema=paschema.PromptAlignmentScoreReason
  )
  return res.reason
  except TypeError:
@@ -190,48 +211,56 @@ class PromptAlignmentMetric(BaseMetric):

  async def _a_generate_verdicts(
  self, input: str, actual_output: str
- ) -> Verdicts:
+ ) -> paschema.Verdicts:
  prompt = PromptAlignmentTemplate.generate_verdicts(
  prompt_instructions=self.prompt_instructions,
  input=input,
  actual_output=actual_output,
  )
  if self.using_native_model:
- res, cost = await self.model.a_generate(prompt, schema=Verdicts)
+ res, cost = await self.model.a_generate(
+ prompt, schema=paschema.Verdicts
+ )
  self.evaluation_cost += cost
  return [item for item in res.verdicts]
  else:
  try:
- res: Verdicts = await self.model.a_generate(
- prompt, schema=Verdicts
+ res: paschema.Verdicts = await self.model.a_generate(
+ prompt, schema=paschema.Verdicts
  )
  return [item for item in res.verdicts]
  except TypeError:
  res = await self.model.a_generate(prompt)
  data = trimAndLoadJson(res, self)
  return [
- PromptAlignmentVerdict(**item) for item in data["verdicts"]
+ paschema.PromptAlignmentVerdict(**item)
+ for item in data["verdicts"]
  ]

- def _generate_verdicts(self, input: str, actual_output: str) -> Verdicts:
+ def _generate_verdicts(
+ self, input: str, actual_output: str
+ ) -> paschema.Verdicts:
  prompt = PromptAlignmentTemplate.generate_verdicts(
  prompt_instructions=self.prompt_instructions,
  input=input,
  actual_output=actual_output,
  )
  if self.using_native_model:
- res, cost = self.model.generate(prompt, schema=Verdicts)
+ res, cost = self.model.generate(prompt, schema=paschema.Verdicts)
  self.evaluation_cost += cost
  return [item for item in res.verdicts]
  else:
  try:
- res: Verdicts = self.model.generate(prompt, schema=Verdicts)
+ res: paschema.Verdicts = self.model.generate(
+ prompt, schema=paschema.Verdicts
+ )
  return [item for item in res.verdicts]
  except TypeError:
  res = self.model.generate(prompt)
  data = trimAndLoadJson(res, self)
  return [
- PromptAlignmentVerdict(**item) for item in data["verdicts"]
+ paschema.PromptAlignmentVerdict(**item)
+ for item in data["verdicts"]
  ]

  def _calculate_score(self):
@@ -253,7 +282,7 @@ class PromptAlignmentMetric(BaseMetric):
  else:
  try:
  self.success = self.score >= self.threshold
- except:
+ except TypeError:
  self.success = False
  return self.success
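
The PromptAlignmentMetric change above bounds the synchronous path with a per-task timeout by building the a_measure coroutine first and wrapping it in asyncio.wait_for. A standalone sketch of the same pattern is shown below; the fixed 30-second value stands in for get_settings().DEEPEVAL_PER_TASK_TIMEOUT_SECONDS, and the toy coroutine replaces the real LLM-backed evaluation.

import asyncio

PER_TASK_TIMEOUT_SECONDS = 30.0  # stand-in for the deepeval setting


async def a_measure() -> float:
    # Stand-in for the real async, LLM-backed evaluation.
    await asyncio.sleep(0.1)
    return 0.9


def measure() -> float:
    # Mirror of the new sync path: build the coroutine, then bound it with
    # asyncio.wait_for so a hung evaluation raises TimeoutError instead of
    # blocking forever.
    coro = a_measure()
    return asyncio.run(asyncio.wait_for(coro, timeout=PER_TASK_TIMEOUT_SECONDS))


if __name__ == "__main__":
    print(measure())
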
deepeval/metrics/role_adherence/role_adherence.py
@@ -1,6 +1,7 @@
  from typing import Optional, Union, List

  from deepeval.metrics import BaseConversationalMetric
+ from deepeval.metrics.api import metric_data_manager
  from deepeval.metrics.role_adherence.schema import (
  OutOfCharacterResponseVerdicts,
  )
@@ -44,6 +45,7 @@ class RoleAdherenceMetric(BaseConversationalMetric):
  test_case: ConversationalTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  ):
  check_conversational_test_case_params(
  test_case,
@@ -63,6 +65,7 @@ class RoleAdherenceMetric(BaseConversationalMetric):
  test_case,
  _show_indicator=False,
  _in_component=_in_component,
+ _log_metric_to_confident=_log_metric_to_confident,
  )
  )
  else:
@@ -82,6 +85,10 @@ class RoleAdherenceMetric(BaseConversationalMetric):
  f"Score: {self.score}\nReason: {self.reason}",
  ],
  )
+ if _log_metric_to_confident:
+ metric_data_manager.post_metric_if_enabled(
+ self, test_case=test_case
+ )
  return self.score

  async def a_measure(
@@ -89,6 +96,7 @@ class RoleAdherenceMetric(BaseConversationalMetric):
  test_case: ConversationalTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  ) -> float:
  check_conversational_test_case_params(
  test_case,
@@ -124,6 +132,10 @@ class RoleAdherenceMetric(BaseConversationalMetric):
  f"Score: {self.score}\nReason: {self.reason}",
  ],
  )
+ if _log_metric_to_confident:
+ metric_data_manager.post_metric_if_enabled(
+ self, test_case=test_case
+ )
  return self.score

  async def _a_generate_reason(self, role: str) -> str:
deepeval/metrics/role_violation/role_violation.py
@@ -17,6 +17,7 @@ from deepeval.metrics.utils import (
  )
  from deepeval.metrics.role_violation.template import RoleViolationTemplate
  from deepeval.metrics.role_violation.schema import *
+ from deepeval.metrics.api import metric_data_manager


  class RoleViolationMetric(BaseMetric):
@@ -58,6 +59,7 @@ class RoleViolationMetric(BaseMetric):
  test_case: LLMTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  ) -> float:

  check_llm_test_case_params(test_case, self._required_params, self)
@@ -73,6 +75,7 @@ class RoleViolationMetric(BaseMetric):
  test_case,
  _show_indicator=False,
  _in_component=_in_component,
+ _log_metric_to_confident=_log_metric_to_confident,
  )
  )
  else:
@@ -94,6 +97,10 @@ class RoleViolationMetric(BaseMetric):
  f"Score: {self.score}\nReason: {self.reason}",
  ],
  )
+ if _log_metric_to_confident:
+ metric_data_manager.post_metric_if_enabled(
+ self, test_case=test_case
+ )

  return self.score

@@ -102,6 +109,7 @@ class RoleViolationMetric(BaseMetric):
  test_case: LLMTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  ) -> float:

  check_llm_test_case_params(test_case, self._required_params, self)
@@ -131,6 +139,10 @@ class RoleViolationMetric(BaseMetric):
  f"Score: {self.score}\nReason: {self.reason}",
  ],
  )
+ if _log_metric_to_confident:
+ metric_data_manager.post_metric_if_enabled(
+ self, test_case=test_case
+ )

  return self.score