deepeval 3.6.6__py3-none-any.whl → 3.6.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. deepeval/_version.py +1 -1
  2. deepeval/benchmarks/equity_med_qa/equity_med_qa.py +1 -0
  3. deepeval/cli/main.py +42 -0
  4. deepeval/confident/api.py +1 -0
  5. deepeval/config/settings.py +22 -4
  6. deepeval/constants.py +8 -1
  7. deepeval/dataset/dataset.py +2 -11
  8. deepeval/dataset/utils.py +1 -1
  9. deepeval/evaluate/evaluate.py +5 -1
  10. deepeval/evaluate/execute.py +97 -42
  11. deepeval/evaluate/utils.py +20 -116
  12. deepeval/integrations/crewai/__init__.py +6 -1
  13. deepeval/integrations/crewai/handler.py +1 -1
  14. deepeval/integrations/crewai/subs.py +51 -0
  15. deepeval/integrations/crewai/wrapper.py +45 -5
  16. deepeval/metrics/answer_relevancy/answer_relevancy.py +12 -3
  17. deepeval/metrics/api.py +281 -0
  18. deepeval/metrics/argument_correctness/argument_correctness.py +12 -2
  19. deepeval/metrics/bias/bias.py +12 -3
  20. deepeval/metrics/contextual_precision/contextual_precision.py +12 -3
  21. deepeval/metrics/contextual_recall/contextual_recall.py +12 -3
  22. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +12 -1
  23. deepeval/metrics/conversation_completeness/conversation_completeness.py +12 -0
  24. deepeval/metrics/conversational_dag/conversational_dag.py +12 -0
  25. deepeval/metrics/conversational_dag/nodes.py +12 -4
  26. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +73 -59
  27. deepeval/metrics/dag/dag.py +12 -0
  28. deepeval/metrics/dag/nodes.py +12 -4
  29. deepeval/metrics/faithfulness/faithfulness.py +12 -1
  30. deepeval/metrics/g_eval/g_eval.py +11 -0
  31. deepeval/metrics/hallucination/hallucination.py +12 -1
  32. deepeval/metrics/indicator.py +8 -2
  33. deepeval/metrics/json_correctness/json_correctness.py +12 -1
  34. deepeval/metrics/knowledge_retention/knowledge_retention.py +12 -0
  35. deepeval/metrics/mcp/mcp_task_completion.py +13 -0
  36. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +13 -0
  37. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +12 -1
  38. deepeval/metrics/misuse/misuse.py +12 -1
  39. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +3 -0
  40. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +3 -0
  41. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +3 -0
  42. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +3 -0
  43. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +6 -1
  44. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +6 -1
  45. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +3 -0
  46. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +3 -0
  47. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +3 -0
  48. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +3 -0
  49. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +10 -5
  50. deepeval/metrics/non_advice/non_advice.py +12 -0
  51. deepeval/metrics/pii_leakage/pii_leakage.py +12 -1
  52. deepeval/metrics/prompt_alignment/prompt_alignment.py +12 -1
  53. deepeval/metrics/role_adherence/role_adherence.py +12 -0
  54. deepeval/metrics/role_violation/role_violation.py +12 -0
  55. deepeval/metrics/summarization/summarization.py +12 -1
  56. deepeval/metrics/task_completion/task_completion.py +3 -0
  57. deepeval/metrics/tool_correctness/tool_correctness.py +8 -0
  58. deepeval/metrics/toxicity/toxicity.py +12 -0
  59. deepeval/metrics/turn_relevancy/turn_relevancy.py +12 -0
  60. deepeval/models/llms/grok_model.py +1 -1
  61. deepeval/models/llms/openai_model.py +2 -0
  62. deepeval/openai/__init__.py +14 -32
  63. deepeval/openai/extractors.py +24 -34
  64. deepeval/openai/patch.py +256 -161
  65. deepeval/openai/types.py +20 -0
  66. deepeval/openai/utils.py +98 -56
  67. deepeval/prompt/__init__.py +19 -1
  68. deepeval/prompt/api.py +160 -0
  69. deepeval/prompt/prompt.py +244 -62
  70. deepeval/prompt/utils.py +144 -2
  71. deepeval/synthesizer/chunking/context_generator.py +209 -152
  72. deepeval/synthesizer/chunking/doc_chunker.py +46 -12
  73. deepeval/synthesizer/synthesizer.py +8 -5
  74. deepeval/test_case/api.py +131 -0
  75. deepeval/test_run/__init__.py +1 -0
  76. deepeval/test_run/hyperparameters.py +47 -8
  77. deepeval/test_run/test_run.py +104 -1
  78. deepeval/tracing/api.py +3 -1
  79. deepeval/tracing/message_types/__init__.py +10 -0
  80. deepeval/tracing/message_types/base.py +6 -0
  81. deepeval/tracing/message_types/messages.py +14 -0
  82. deepeval/tracing/message_types/tools.py +18 -0
  83. deepeval/tracing/otel/utils.py +1 -1
  84. deepeval/tracing/trace_context.py +73 -4
  85. deepeval/tracing/tracing.py +51 -3
  86. deepeval/tracing/types.py +16 -0
  87. deepeval/tracing/utils.py +8 -0
  88. {deepeval-3.6.6.dist-info → deepeval-3.6.7.dist-info}/METADATA +1 -1
  89. {deepeval-3.6.6.dist-info → deepeval-3.6.7.dist-info}/RECORD +92 -84
  90. {deepeval-3.6.6.dist-info → deepeval-3.6.7.dist-info}/LICENSE.md +0 -0
  91. {deepeval-3.6.6.dist-info → deepeval-3.6.7.dist-info}/WHEEL +0 -0
  92. {deepeval-3.6.6.dist-info → deepeval-3.6.7.dist-info}/entry_points.txt +0 -0
deepeval/integrations/crewai/subs.py (new file)
@@ -0,0 +1,51 @@
+from typing import List, Optional, Type, TypeVar
+from pydantic import PrivateAttr
+
+from deepeval.metrics.base_metric import BaseMetric
+
+try:
+    from crewai import Crew, Agent, LLM
+
+    is_crewai_installed = True
+except ImportError:
+    is_crewai_installed = False
+
+
+def is_crewai_installed():
+    if not is_crewai_installed:
+        raise ImportError(
+            "CrewAI is not installed. Please install it with `pip install crewai`."
+        )
+
+
+T = TypeVar("T")
+
+
+def create_deepeval_class(base_class: Type[T], class_name: str) -> Type[T]:
+    """Factory function to create DeepEval-enabled CrewAI classes"""
+
+    class DeepEvalClass(base_class):
+        _metric_collection: Optional[str] = PrivateAttr(default=None)
+        _metrics: Optional[List[BaseMetric]] = PrivateAttr(default=None)
+
+        def __init__(
+            self,
+            *args,
+            metrics: Optional[List[BaseMetric]] = None,
+            metric_collection: Optional[str] = None,
+            **kwargs
+        ):
+            is_crewai_installed()
+            super().__init__(*args, **kwargs)
+            self._metric_collection = metric_collection
+            self._metrics = metrics
+
+    DeepEvalClass.__name__ = class_name
+    DeepEvalClass.__qualname__ = class_name
+    return DeepEvalClass
+
+
+# Create the classes
+DeepEvalCrew = create_deepeval_class(Crew, "DeepEvalCrew")
+DeepEvalAgent = create_deepeval_class(Agent, "DeepEvalAgent")
+DeepEvalLLM = create_deepeval_class(LLM, "DeepEvalLLM")
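The factory above stores metrics and a metric_collection name as Pydantic private attributes on subclasses of CrewAI's own classes; the patched wrappers in the next file read them back at kickoff time. A minimal usage sketch, assuming DeepEvalCrew and DeepEvalAgent are exported from deepeval.integrations.crewai (its __init__.py also changed in this release) and following the standard CrewAI Task/kickoff flow:

# Hypothetical usage sketch, not taken from the deepeval docs.
from crewai import Task
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.integrations.crewai import DeepEvalAgent, DeepEvalCrew  # assumed re-exports

researcher = DeepEvalAgent(
    role="Researcher",
    goal="Answer questions accurately",
    backstory="A careful research assistant.",
    metrics=[AnswerRelevancyMetric()],  # stored on the private _metrics attribute
)
task = Task(
    description="Summarize the latest release notes.",
    expected_output="A short summary.",
    agent=researcher,
)
crew = DeepEvalCrew(
    agents=[researcher],
    tasks=[task],
    metric_collection="crewai-demo",  # stored on _metric_collection
)
crew.kickoff()  # the patched kickoff() forwards metrics into the Observer span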
deepeval/integrations/crewai/wrapper.py
@@ -3,6 +3,7 @@ from crewai.crew import Crew
 from crewai.agent import Agent
 from functools import wraps
 from deepeval.tracing.tracing import Observer
+from typing import Any
 
 
 def wrap_crew_kickoff():
@@ -10,7 +11,13 @@ def wrap_crew_kickoff():
 
     @wraps(original_kickoff)
     def wrapper(self, *args, **kwargs):
-        with Observer(span_type="crew", func_name="kickoff"):
+        metric_collection, metrics = _check_metrics_and_metric_collection(self)
+        with Observer(
+            span_type="crew",
+            func_name="kickoff",
+            metric_collection=metric_collection,
+            metrics=metrics,
+        ):
             result = original_kickoff(self, *args, **kwargs)
 
         return result
@@ -23,7 +30,13 @@ def wrap_crew_kickoff_for_each():
 
     @wraps(original_kickoff_for_each)
     def wrapper(self, *args, **kwargs):
-        with Observer(span_type="crew", func_name="kickoff_for_each"):
+        metric_collection, metrics = _check_metrics_and_metric_collection(self)
+        with Observer(
+            span_type="crew",
+            func_name="kickoff_for_each",
+            metric_collection=metric_collection,
+            metrics=metrics,
+        ):
             result = original_kickoff_for_each(self, *args, **kwargs)
 
         return result
@@ -36,7 +49,13 @@ def wrap_crew_kickoff_async():
 
     @wraps(original_kickoff_async)
     async def wrapper(self, *args, **kwargs):
-        with Observer(span_type="crew", func_name="kickoff_async"):
+        metric_collection, metrics = _check_metrics_and_metric_collection(self)
+        with Observer(
+            span_type="crew",
+            func_name="kickoff_async",
+            metric_collection=metric_collection,
+            metrics=metrics,
+        ):
             result = await original_kickoff_async(self, *args, **kwargs)
 
         return result
@@ -49,7 +68,13 @@ def wrap_crew_kickoff_for_each_async():
 
     @wraps(original_kickoff_for_each_async)
     async def wrapper(self, *args, **kwargs):
-        with Observer(span_type="crew", func_name="kickoff_for_each_async"):
+        metric_collection, metrics = _check_metrics_and_metric_collection(self)
+        with Observer(
+            span_type="crew",
+            func_name="kickoff_for_each_async",
+            metric_collection=metric_collection,
+            metrics=metrics,
+        ):
             result = await original_kickoff_for_each_async(
                 self, *args, **kwargs
             )
@@ -64,10 +89,13 @@ def wrap_llm_call():
 
     @wraps(original_llm_call)
     def wrapper(self, *args, **kwargs):
+        metric_collection, metrics = _check_metrics_and_metric_collection(self)
         with Observer(
             span_type="llm",
             func_name="call",
             observe_kwargs={"model": "temp_model"},
+            metric_collection=metric_collection,
+            metrics=metrics,
         ):
             result = original_llm_call(self, *args, **kwargs)
             return result
@@ -80,8 +108,20 @@ def wrap_agent_execute_task():
 
     @wraps(original_execute_task)
     def wrapper(self, *args, **kwargs):
-        with Observer(span_type="agent", func_name="execute_task"):
+        metric_collection, metrics = _check_metrics_and_metric_collection(self)
+        with Observer(
+            span_type="agent",
+            func_name="execute_task",
+            metric_collection=metric_collection,
+            metrics=metrics,
+        ):
             result = original_execute_task(self, *args, **kwargs)
             return result
 
     Agent.execute_task = wrapper
+
+
+def _check_metrics_and_metric_collection(obj: Any):
+    metric_collection = getattr(obj, "_metric_collection", None)
+    metrics = getattr(obj, "_metrics", None)
+    return metric_collection, metrics
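Because _check_metrics_and_metric_collection uses getattr with a None default, plain Crew, Agent, and LLM instances keep the previous tracing behaviour; only the DeepEval* subclasses defined in subs.py carry metrics and metric_collection into the enclosing Observer span.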
deepeval/metrics/answer_relevancy/answer_relevancy.py
@@ -16,6 +16,7 @@ from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.answer_relevancy.template import AnswerRelevancyTemplate
 from deepeval.metrics.indicator import metric_progress_indicator
 from deepeval.metrics.answer_relevancy.schema import *
+from deepeval.metrics.api import metric_data_manager
 
 
 class AnswerRelevancyMetric(BaseMetric):
@@ -50,8 +51,8 @@ class AnswerRelevancyMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
-
         check_llm_test_case_params(test_case, self._required_params, self)
 
         self.evaluation_cost = 0 if self.using_native_model else None
@@ -65,6 +66,7 @@ class AnswerRelevancyMetric(BaseMetric):
                     test_case,
                     _show_indicator=False,
                     _in_component=_in_component,
+                    _log_metric_to_confident=_log_metric_to_confident,
                 )
             )
         else:
@@ -85,6 +87,10 @@ class AnswerRelevancyMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
 
             return self.score
 
@@ -93,8 +99,8 @@ class AnswerRelevancyMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
-
         check_llm_test_case_params(test_case, self._required_params, self)
 
         self.evaluation_cost = 0 if self.using_native_model else None
@@ -121,7 +127,10 @@ class AnswerRelevancyMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
-
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score
 
     async def _a_generate_reason(self, input: str) -> str:
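Every single-turn metric touched in this release gains the same _log_metric_to_confident keyword, defaulting to True, so a bare measure() or a_measure() call now posts its result through metric_data_manager whenever metric logging is enabled and a Confident AI login is present. A minimal sketch of opting one call out (assumes the usual LLMTestCase fields and a configured judge model):

# Sketch only: illustrates the keyword added in this release.
from deepeval.test_case import LLMTestCase
from deepeval.metrics import AnswerRelevancyMetric

test_case = LLMTestCase(
    input="What is DeepEval?",
    actual_output="DeepEval is an open-source framework for evaluating LLM applications.",
)

metric = AnswerRelevancyMetric()
metric.measure(test_case)  # posts metric data in the background if logging is enabled
metric.measure(test_case, _log_metric_to_confident=False)  # skips the background post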
deepeval/metrics/api.py (new file)
@@ -0,0 +1,281 @@
+from typing import Optional, Set, Any, Dict, List, Union
+import threading
+import asyncio
+import queue
+import atexit
+from time import perf_counter
+from enum import Enum
+from pydantic import Field
+from rich.console import Console
+
+from deepeval.confident.api import Api, HttpMethods, Endpoints, is_confident
+from deepeval.constants import (
+    CONFIDENT_METRIC_LOGGING_FLUSH,
+    CONFIDENT_METRIC_LOGGING_VERBOSE,
+)
+from deepeval.metrics.base_metric import BaseConversationalMetric, BaseMetric
+from deepeval.test_case.conversational_test_case import ConversationalTestCase
+from deepeval.test_case.llm_test_case import LLMTestCase
+from deepeval.test_case.api import create_api_test_case
+from deepeval.test_run.api import LLMApiTestCase, ConversationalApiTestCase
+from deepeval.tracing.api import MetricData
+from deepeval.config.settings import get_settings
+
+
+class MetricWorkerStatus(Enum):
+    SUCCESS = "success"
+    FAILURE = "failure"
+    WARNING = "warning"
+
+
+class ApiMetricData(MetricData):
+    llm_test_case: Optional[LLMApiTestCase] = Field(None, alias="llmTestCase")
+    conversational_test_case: Optional[ConversationalApiTestCase] = Field(
+        None, alias="conversationalTestCase"
+    )
+
+
+class MetricDataManager:
+    """Manager for posting metric data asynchronously in background thread."""
+
+    def __init__(self):
+        settings = get_settings()
+        # Initialize queue and worker thread for metric posting
+        self._metric_queue = queue.Queue()
+        self._worker_thread = None
+        self._min_interval = 0.2  # Minimum time between API calls (seconds)
+        self._last_post_time = 0
+        self._in_flight_tasks: Set[asyncio.Task[Any]] = set()
+        self._flush_enabled = bool(settings.CONFIDENT_METRIC_LOGGING_FLUSH)
+        self._daemon = not self._flush_enabled
+        self._thread_lock = threading.Lock()
+        self.metric_logging_enabled = bool(
+            settings.CONFIDENT_METRIC_LOGGING_ENABLED
+        )
+
+        # Register an exit handler to warn about unprocessed metrics
+        atexit.register(self._warn_on_exit)
+
+    def post_metric_if_enabled(
+        self,
+        metric: Union[BaseMetric, BaseConversationalMetric],
+        test_case: Optional[Union[LLMTestCase, ConversationalTestCase]] = None,
+    ):
+        """Post metric data asynchronously in a background thread."""
+        if not self.metric_logging_enabled or not is_confident():
+            return
+
+        from deepeval.evaluate.utils import create_metric_data
+
+        metric_data = create_metric_data(metric)
+        api_metric_data = ApiMetricData(
+            **metric_data.model_dump(by_alias=True, exclude_none=True)
+        )
+
+        if isinstance(test_case, LLMTestCase):
+            api_metric_data.llm_test_case = create_api_test_case(test_case)
+        elif isinstance(test_case, ConversationalTestCase):
+            api_metric_data.conversational_test_case = create_api_test_case(
+                test_case
+            )
+
+        self._ensure_worker_thread_running()
+        self._metric_queue.put(api_metric_data)
+
+    def _warn_on_exit(self):
+        """Warn if there are unprocessed metrics on exit."""
+        queue_size = self._metric_queue.qsize()
+        in_flight = len(self._in_flight_tasks)
+        remaining_tasks = queue_size + in_flight
+
+        if not self._flush_enabled and remaining_tasks > 0:
+            self._print_metric_data_status(
+                metric_worker_status=MetricWorkerStatus.WARNING,
+                message=f"Exiting with {queue_size + in_flight} abandoned metric(s).",
+                description=f"Set {CONFIDENT_METRIC_LOGGING_FLUSH}=1 as an environment variable to flush remaining metrics to Confident AI.",
+            )
+
+    def _ensure_worker_thread_running(self):
+        """Ensure the background worker thread is running."""
+        with self._thread_lock:
+            if (
+                self._worker_thread is None
+                or not self._worker_thread.is_alive()
+            ):
+                self._worker_thread = threading.Thread(
+                    target=self._process_metric_queue,
+                    daemon=self._daemon,
+                )
+                self._worker_thread.start()
+
+    def _print_metric_data_status(
+        self,
+        metric_worker_status: MetricWorkerStatus,
+        message: str,
+        description: Optional[str] = None,
+    ):
+        """Print metric data worker status messages."""
+        if getattr(get_settings(), CONFIDENT_METRIC_LOGGING_VERBOSE, False):
+            console = Console()
+            message_prefix = "[dim][Confident AI Metric Data Log][/dim]"
+            if metric_worker_status == MetricWorkerStatus.SUCCESS:
+                message = f"[green]{message}[/green]"
+            elif metric_worker_status == MetricWorkerStatus.FAILURE:
+                message = f"[red]{message}[/red]"
+            elif metric_worker_status == MetricWorkerStatus.WARNING:
+                message = f"[yellow]{message}[/yellow]"
+
+            if bool(CONFIDENT_METRIC_LOGGING_VERBOSE):
+                if description:
+                    message += f": {description}"
+
+            console.print(
+                message_prefix,
+                message,
+                f"\nTo disable dev logging, set {CONFIDENT_METRIC_LOGGING_VERBOSE}=0 as an environment variable.",
+            )
+
+    def _process_metric_queue(self):
+        """Worker thread function that processes the metric queue."""
+        import threading
+
+        main_thr = threading.main_thread()
+
+        # Create a new event loop
+        loop = asyncio.new_event_loop()
+        asyncio.set_event_loop(loop)
+
+        # Buffer for payloads that need to be sent after main exits
+        remaining_metric_request_bodies: List[Dict[str, Any]] = []
+
+        async def _a_send_metric(metric_data: ApiMetricData):
+            nonlocal remaining_metric_request_bodies
+            try:
+                # Build API object & payload
+                try:
+                    body = metric_data.model_dump(
+                        by_alias=True,
+                        exclude_none=True,
+                    )
+                except AttributeError:
+                    # Pydantic version below 2.0
+                    body = metric_data.dict(by_alias=True, exclude_none=True)
+
+                # If the main thread is still alive, send now
+                if main_thr.is_alive():
+                    api = Api()
+                    _, _ = await api.a_send_request(
+                        method=HttpMethods.POST,
+                        endpoint=Endpoints.METRIC_DATA_ENDPOINT,
+                        body=body,
+                    )
+                    queue_size = self._metric_queue.qsize()
+                    in_flight = len(self._in_flight_tasks)
+                    status = f"({queue_size} metric{'s' if queue_size!=1 else ''} remaining in queue, {in_flight} in flight)"
+                    self._print_metric_data_status(
+                        metric_worker_status=MetricWorkerStatus.SUCCESS,
+                        message=f"Successfully posted metric data {status}",
+                    )
+                elif self._flush_enabled:
+                    # Main thread gone → to be flushed
+                    remaining_metric_request_bodies.append(body)
+
+            except Exception as e:
+                queue_size = self._metric_queue.qsize()
+                in_flight = len(self._in_flight_tasks)
+                status = f"({queue_size} metric{'s' if queue_size!=1 else ''} remaining in queue, {in_flight} in flight)"
+                self._print_metric_data_status(
+                    metric_worker_status=MetricWorkerStatus.FAILURE,
+                    message=f"Error posting metric data {status}",
+                    description=str(e),
+                )
+            finally:
+                task = asyncio.current_task()
+                if task:
+                    self._in_flight_tasks.discard(task)
+
+        async def async_worker():
+            # Continue while user code is running or work remains
+            while (
+                main_thr.is_alive()
+                or not self._metric_queue.empty()
+                or self._in_flight_tasks
+            ):
+                try:
+                    metric_data = self._metric_queue.get(
+                        block=True, timeout=1.0
+                    )
+
+                    # Rate-limit
+                    now = perf_counter()
+                    elapsed = now - self._last_post_time
+                    if elapsed < self._min_interval:
+                        await asyncio.sleep(self._min_interval - elapsed)
+                    self._last_post_time = perf_counter()
+
+                    # Schedule async send
+                    task = asyncio.create_task(_a_send_metric(metric_data))
+                    self._in_flight_tasks.add(task)
+                    self._metric_queue.task_done()
+
+                except queue.Empty:
+                    await asyncio.sleep(0.1)
+                    continue
+                except Exception as e:
+                    self._print_metric_data_status(
+                        message="Error in metric worker",
+                        metric_worker_status=MetricWorkerStatus.FAILURE,
+                        description=str(e),
+                    )
+                    await asyncio.sleep(1.0)
+
+        try:
+            loop.run_until_complete(async_worker())
+        finally:
+            # Drain any pending tasks
+            pending = asyncio.all_tasks(loop=loop)
+            if pending:
+                loop.run_until_complete(
+                    asyncio.gather(*pending, return_exceptions=True)
+                )
+            self._flush_metrics(remaining_metric_request_bodies)
+            loop.run_until_complete(loop.shutdown_asyncgens())
+            loop.close()
+
+    def _flush_metrics(
+        self, remaining_metric_request_bodies: List[Dict[str, Any]]
+    ):
+        """Flush remaining metrics synchronously."""
+        if not remaining_metric_request_bodies:
+            return
+
+        self._print_metric_data_status(
+            MetricWorkerStatus.WARNING,
+            message=f"Flushing {len(remaining_metric_request_bodies)} remaining metric(s)",
+        )
+
+        for body in remaining_metric_request_bodies:
+            try:
+                api = Api()
+                _, link = api.send_request(
+                    method=HttpMethods.POST,
+                    endpoint=Endpoints.METRIC_DATA_ENDPOINT,
+                    body=body,
+                )
+                qs = self._metric_queue.qsize()
+                self._print_metric_data_status(
+                    metric_worker_status=MetricWorkerStatus.SUCCESS,
+                    message=f"Successfully posted metric data ({qs} metrics remaining in queue, 1 in flight)",
+                    description=link,
+                )
+            except Exception as e:
+                qs = self._metric_queue.qsize()
+                self._print_metric_data_status(
+                    metric_worker_status=MetricWorkerStatus.FAILURE,
+                    message="Error flushing remaining metric(s)",
+                    description=str(e),
+                )
+
+
+# Global metric manager instance
+metric_data_manager = MetricDataManager()
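MetricDataManager captures its configuration from get_settings() when the module is first imported: logging itself, verbose worker output, and exit-time flushing are all opt-in. A sketch of toggling them through environment variables, assuming the constant names above double as the variable names and accept 0/1 values (neither assumption is confirmed by this diff):

# Assumed configuration sketch; variable names mirror the constants imported above.
import os

os.environ["CONFIDENT_METRIC_LOGGING_ENABLED"] = "1"  # enable background metric posting
os.environ["CONFIDENT_METRIC_LOGGING_VERBOSE"] = "1"  # print worker status messages
os.environ["CONFIDENT_METRIC_LOGGING_FLUSH"] = "1"  # flush queued metrics at exit (worker becomes non-daemon)

# Settings are read when MetricDataManager is constructed, i.e. when
# deepeval.metrics.api is first imported, so set these before importing deepeval.
import deepeval  # noqa: E402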
deepeval/metrics/argument_correctness/argument_correctness.py
@@ -19,6 +19,7 @@ from deepeval.metrics.argument_correctness.template import (
 )
 from deepeval.metrics.indicator import metric_progress_indicator
 from deepeval.metrics.argument_correctness.schema import *
+from deepeval.metrics.api import metric_data_manager
 
 
 class ArgumentCorrectnessMetric(BaseMetric):
@@ -53,6 +54,7 @@ class ArgumentCorrectnessMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
 
         check_llm_test_case_params(test_case, self._required_params, self)
@@ -68,6 +70,7 @@ class ArgumentCorrectnessMetric(BaseMetric):
                     test_case,
                     _show_indicator=False,
                     _in_component=_in_component,
+                    _log_metric_to_confident=_log_metric_to_confident,
                 )
             )
         else:
@@ -91,7 +94,10 @@ class ArgumentCorrectnessMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
-
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score
 
     async def a_measure(
@@ -99,6 +105,7 @@ class ArgumentCorrectnessMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
 
         check_llm_test_case_params(test_case, self._required_params, self)
@@ -130,7 +137,10 @@ class ArgumentCorrectnessMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
-
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score
 
     async def _a_generate_reason(self, input: str) -> str:
deepeval/metrics/bias/bias.py
@@ -1,6 +1,7 @@
 from typing import List, Optional, Type, Union
 
 from deepeval.metrics import BaseMetric
+from deepeval.metrics.api import metric_data_manager
 from deepeval.test_case import (
     LLMTestCase,
     LLMTestCaseParams,
@@ -48,8 +49,8 @@ class BiasMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
-
         check_llm_test_case_params(test_case, self._required_params, self)
 
         self.evaluation_cost = 0 if self.using_native_model else None
@@ -63,6 +64,7 @@
                     test_case,
                     _show_indicator=False,
                     _in_component=_in_component,
+                    _log_metric_to_confident=_log_metric_to_confident,
                 )
             )
         else:
@@ -81,7 +83,10 @@
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
-
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score
 
     async def a_measure(
@@ -89,8 +94,8 @@ class BiasMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
-
         check_llm_test_case_params(test_case, self._required_params, self)
 
         self.evaluation_cost = 0 if self.using_native_model else None
@@ -116,6 +121,10 @@
                 ],
             )
 
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score
 
     async def _a_generate_reason(self) -> str:
deepeval/metrics/contextual_precision/contextual_precision.py
@@ -18,6 +18,7 @@ from deepeval.metrics.contextual_precision.template import (
 )
 from deepeval.metrics.indicator import metric_progress_indicator
 from deepeval.metrics.contextual_precision.schema import *
+from deepeval.metrics.api import metric_data_manager
 
 
 class ContextualPrecisionMetric(BaseMetric):
@@ -53,8 +54,8 @@ class ContextualPrecisionMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
-
         check_llm_test_case_params(test_case, self._required_params, self)
 
         self.evaluation_cost = 0 if self.using_native_model else None
@@ -68,6 +69,7 @@ class ContextualPrecisionMetric(BaseMetric):
                     test_case,
                     _show_indicator=False,
                     _in_component=_in_component,
+                    _log_metric_to_confident=_log_metric_to_confident,
                 )
             )
         else:
@@ -88,7 +90,10 @@ class ContextualPrecisionMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
-
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score
 
     async def a_measure(
@@ -96,6 +101,7 @@ class ContextualPrecisionMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
 
         check_llm_test_case_params(test_case, self._required_params, self)
@@ -124,7 +130,10 @@ class ContextualPrecisionMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
-
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score
 
     async def _a_generate_reason(self, input: str):