deepeval 3.6.6__py3-none-any.whl → 3.6.8__py3-none-any.whl

This diff compares publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between these versions as they appear in their respective public registries.
Files changed (134)
  1. deepeval/_version.py +1 -1
  2. deepeval/benchmarks/equity_med_qa/equity_med_qa.py +1 -0
  3. deepeval/cli/main.py +42 -0
  4. deepeval/confident/api.py +1 -0
  5. deepeval/config/settings.py +22 -4
  6. deepeval/constants.py +8 -1
  7. deepeval/dataset/dataset.py +2 -11
  8. deepeval/dataset/utils.py +1 -1
  9. deepeval/errors.py +20 -2
  10. deepeval/evaluate/evaluate.py +5 -1
  11. deepeval/evaluate/execute.py +811 -248
  12. deepeval/evaluate/types.py +1 -0
  13. deepeval/evaluate/utils.py +33 -119
  14. deepeval/integrations/crewai/__init__.py +7 -1
  15. deepeval/integrations/crewai/handler.py +1 -1
  16. deepeval/integrations/crewai/subs.py +51 -0
  17. deepeval/integrations/crewai/tool.py +71 -0
  18. deepeval/integrations/crewai/wrapper.py +45 -5
  19. deepeval/integrations/llama_index/__init__.py +0 -4
  20. deepeval/integrations/llama_index/handler.py +20 -21
  21. deepeval/integrations/pydantic_ai/instrumentator.py +125 -76
  22. deepeval/metrics/__init__.py +13 -0
  23. deepeval/metrics/answer_relevancy/answer_relevancy.py +12 -3
  24. deepeval/metrics/api.py +281 -0
  25. deepeval/metrics/argument_correctness/argument_correctness.py +12 -2
  26. deepeval/metrics/base_metric.py +1 -0
  27. deepeval/metrics/bias/bias.py +12 -3
  28. deepeval/metrics/contextual_precision/contextual_precision.py +39 -24
  29. deepeval/metrics/contextual_recall/contextual_recall.py +12 -3
  30. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +12 -1
  31. deepeval/metrics/conversation_completeness/conversation_completeness.py +12 -0
  32. deepeval/metrics/conversational_dag/conversational_dag.py +12 -0
  33. deepeval/metrics/conversational_dag/nodes.py +12 -4
  34. deepeval/metrics/conversational_g_eval/__init__.py +3 -0
  35. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +84 -66
  36. deepeval/metrics/dag/dag.py +12 -0
  37. deepeval/metrics/dag/nodes.py +12 -4
  38. deepeval/metrics/dag/schema.py +1 -1
  39. deepeval/metrics/dag/templates.py +2 -2
  40. deepeval/metrics/faithfulness/faithfulness.py +12 -1
  41. deepeval/metrics/g_eval/g_eval.py +11 -0
  42. deepeval/metrics/goal_accuracy/__init__.py +1 -0
  43. deepeval/metrics/goal_accuracy/goal_accuracy.py +349 -0
  44. deepeval/metrics/goal_accuracy/schema.py +17 -0
  45. deepeval/metrics/goal_accuracy/template.py +235 -0
  46. deepeval/metrics/hallucination/hallucination.py +20 -9
  47. deepeval/metrics/indicator.py +8 -2
  48. deepeval/metrics/json_correctness/json_correctness.py +12 -1
  49. deepeval/metrics/knowledge_retention/knowledge_retention.py +12 -0
  50. deepeval/metrics/mcp/mcp_task_completion.py +20 -2
  51. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +29 -6
  52. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +14 -2
  53. deepeval/metrics/misuse/misuse.py +12 -1
  54. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +3 -0
  55. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +3 -0
  56. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +3 -0
  57. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +3 -0
  58. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +6 -1
  59. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +38 -25
  60. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +3 -0
  61. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +3 -0
  62. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +3 -0
  63. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +3 -0
  64. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +10 -5
  65. deepeval/metrics/non_advice/non_advice.py +12 -0
  66. deepeval/metrics/pii_leakage/pii_leakage.py +12 -1
  67. deepeval/metrics/plan_adherence/__init__.py +1 -0
  68. deepeval/metrics/plan_adherence/plan_adherence.py +292 -0
  69. deepeval/metrics/plan_adherence/schema.py +11 -0
  70. deepeval/metrics/plan_adherence/template.py +170 -0
  71. deepeval/metrics/plan_quality/__init__.py +1 -0
  72. deepeval/metrics/plan_quality/plan_quality.py +292 -0
  73. deepeval/metrics/plan_quality/schema.py +11 -0
  74. deepeval/metrics/plan_quality/template.py +101 -0
  75. deepeval/metrics/prompt_alignment/prompt_alignment.py +12 -1
  76. deepeval/metrics/role_adherence/role_adherence.py +12 -0
  77. deepeval/metrics/role_violation/role_violation.py +12 -0
  78. deepeval/metrics/step_efficiency/__init__.py +1 -0
  79. deepeval/metrics/step_efficiency/schema.py +11 -0
  80. deepeval/metrics/step_efficiency/step_efficiency.py +234 -0
  81. deepeval/metrics/step_efficiency/template.py +256 -0
  82. deepeval/metrics/summarization/summarization.py +12 -1
  83. deepeval/metrics/task_completion/task_completion.py +4 -0
  84. deepeval/metrics/tool_correctness/schema.py +6 -0
  85. deepeval/metrics/tool_correctness/template.py +88 -0
  86. deepeval/metrics/tool_correctness/tool_correctness.py +233 -21
  87. deepeval/metrics/tool_use/__init__.py +1 -0
  88. deepeval/metrics/tool_use/schema.py +19 -0
  89. deepeval/metrics/tool_use/template.py +220 -0
  90. deepeval/metrics/tool_use/tool_use.py +458 -0
  91. deepeval/metrics/topic_adherence/__init__.py +1 -0
  92. deepeval/metrics/topic_adherence/schema.py +16 -0
  93. deepeval/metrics/topic_adherence/template.py +162 -0
  94. deepeval/metrics/topic_adherence/topic_adherence.py +355 -0
  95. deepeval/metrics/toxicity/toxicity.py +12 -0
  96. deepeval/metrics/turn_relevancy/turn_relevancy.py +12 -0
  97. deepeval/models/embedding_models/azure_embedding_model.py +37 -36
  98. deepeval/models/embedding_models/local_embedding_model.py +30 -32
  99. deepeval/models/embedding_models/ollama_embedding_model.py +18 -20
  100. deepeval/models/embedding_models/openai_embedding_model.py +22 -31
  101. deepeval/models/llms/grok_model.py +1 -1
  102. deepeval/models/llms/openai_model.py +2 -0
  103. deepeval/openai/__init__.py +14 -32
  104. deepeval/openai/extractors.py +85 -50
  105. deepeval/openai/patch.py +258 -167
  106. deepeval/openai/types.py +20 -0
  107. deepeval/openai/utils.py +205 -56
  108. deepeval/prompt/__init__.py +19 -1
  109. deepeval/prompt/api.py +160 -0
  110. deepeval/prompt/prompt.py +245 -62
  111. deepeval/prompt/utils.py +186 -15
  112. deepeval/synthesizer/chunking/context_generator.py +209 -152
  113. deepeval/synthesizer/chunking/doc_chunker.py +46 -12
  114. deepeval/synthesizer/synthesizer.py +19 -15
  115. deepeval/test_case/api.py +131 -0
  116. deepeval/test_case/llm_test_case.py +6 -2
  117. deepeval/test_run/__init__.py +1 -0
  118. deepeval/test_run/hyperparameters.py +47 -8
  119. deepeval/test_run/test_run.py +292 -206
  120. deepeval/tracing/__init__.py +2 -1
  121. deepeval/tracing/api.py +3 -1
  122. deepeval/tracing/otel/exporter.py +3 -4
  123. deepeval/tracing/otel/utils.py +24 -5
  124. deepeval/tracing/trace_context.py +89 -5
  125. deepeval/tracing/tracing.py +74 -3
  126. deepeval/tracing/types.py +20 -2
  127. deepeval/tracing/utils.py +8 -0
  128. deepeval/utils.py +21 -0
  129. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/METADATA +1 -1
  130. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/RECORD +133 -103
  131. deepeval/integrations/llama_index/agent/patched.py +0 -68
  132. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/LICENSE.md +0 -0
  133. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/WHEEL +0 -0
  134. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/entry_points.txt +0 -0
deepeval/integrations/pydantic_ai/instrumentator.py
@@ -1,14 +1,19 @@
 import json
 import logging
 import os
+from time import perf_counter
 from typing import Literal, Optional, List

 from deepeval.config.settings import get_settings
 from deepeval.confident.api import get_confident_api_key
+from deepeval.metrics.base_metric import BaseMetric
 from deepeval.prompt import Prompt
 from deepeval.tracing.context import current_trace_context
 from deepeval.tracing.types import Trace
 from deepeval.tracing.otel.utils import to_hex_string
+from deepeval.tracing.tracing import trace_manager
+from deepeval.tracing.otel.utils import normalize_pydantic_ai_messages
+from deepeval.tracing.otel.exporter import ConfidentSpanExporter


 logger = logging.getLogger(__name__)
@@ -21,6 +26,7 @@ try:
     from opentelemetry.exporter.otlp.proto.http.trace_exporter import (
         OTLPSpanExporter,
     )
+    from opentelemetry.sdk.trace import ReadableSpan

     dependency_installed = True
 except ImportError as e:
@@ -48,24 +54,96 @@ def is_dependency_installed():
     return True


+from deepeval.tracing.types import AgentSpan
 from deepeval.confident.api import get_confident_api_key
 from deepeval.prompt import Prompt
 from deepeval.tracing.otel.test_exporter import test_exporter
 from deepeval.tracing.context import current_trace_context
 from deepeval.tracing.types import Trace
 from deepeval.tracing.otel.utils import to_hex_string
+from deepeval.tracing.types import TraceSpanStatus, ToolCall
+from deepeval.tracing.perf_epoch_bridge import init_clock_bridge

 # OTLP_ENDPOINT = "http://127.0.0.1:4318/v1/traces"
 OTLP_ENDPOINT = "https://otel.confident-ai.com/v1/traces"
+init_clock_bridge()  # initialize clock bridge for perf_counter() to epoch_nanos conversion
+
+
+class ConfidentInstrumentationSettings(InstrumentationSettings):
+
+    def __init__(
+        self,
+        api_key: Optional[str] = None,
+        name: Optional[str] = None,
+        thread_id: Optional[str] = None,
+        user_id: Optional[str] = None,
+        metadata: Optional[dict] = None,
+        tags: Optional[List[str]] = None,
+        metric_collection: Optional[str] = None,
+        confident_prompt: Optional[Prompt] = None,
+        llm_metric_collection: Optional[str] = None,
+        agent_metric_collection: Optional[str] = None,
+        tool_metric_collection_map: Optional[dict] = None,
+        trace_metric_collection: Optional[str] = None,
+        is_test_mode: Optional[bool] = False,
+        agent_metrics: Optional[List[BaseMetric]] = None,
+    ):
+        is_dependency_installed()
+
+        _environment = os.getenv("CONFIDENT_TRACE_ENVIRONMENT", "development")
+        if _environment and _environment in [
+            "production",
+            "staging",
+            "development",
+            "testing",
+        ]:
+            self.environment = _environment
+
+        self.tool_metric_collection_map = tool_metric_collection_map or {}
+        self.name = name
+        self.thread_id = thread_id
+        self.user_id = user_id
+        self.metadata = metadata
+        self.tags = tags
+        self.metric_collection = metric_collection
+        self.confident_prompt = confident_prompt
+        self.llm_metric_collection = llm_metric_collection
+        self.agent_metric_collection = agent_metric_collection
+        self.trace_metric_collection = trace_metric_collection
+        self.is_test_mode = is_test_mode
+        self.agent_metrics = agent_metrics
+
+        if not api_key:
+            api_key = get_confident_api_key()
+        if not api_key:
+            raise ValueError("CONFIDENT_API_KEY is not set")
+
+        trace_provider = TracerProvider()
+
+        # Pass the entire settings instance instead of individual values
+        span_interceptor = SpanInterceptor(self)
+        trace_provider.add_span_processor(span_interceptor)
+
+        if is_test_mode:
+            trace_provider.add_span_processor(BatchSpanProcessor(test_exporter))
+        else:
+            trace_provider.add_span_processor(
+                BatchSpanProcessor(
+                    OTLPSpanExporter(
+                        endpoint=OTLP_ENDPOINT,
+                        headers={"x-confident-api-key": api_key},
+                    )
+                )
+            )
+        super().__init__(tracer_provider=trace_provider)


 class SpanInterceptor(SpanProcessor):
-    def __init__(self, settings_instance):
+    def __init__(self, settings_instance: ConfidentInstrumentationSettings):
         # Keep a reference to the settings instance instead of copying values
-        self.settings: ConfidentInstrumentationSettings = settings_instance
+        self.settings = settings_instance

     def on_start(self, span, parent_context):
-
         # set trace uuid
         _current_trace_context = current_trace_context.get()
         if _current_trace_context and isinstance(_current_trace_context, Trace):
@@ -151,85 +229,56 @@ class SpanInterceptor(SpanProcessor):
         )

     def on_end(self, span):
-        pass
-
+        if self.settings.is_test_mode:
+            if span.attributes.get("confident.span.type") == "agent":

-class ConfidentInstrumentationSettings(InstrumentationSettings):
-
-    name: Optional[str] = None
-    thread_id: Optional[str] = None
-    user_id: Optional[str] = None
-    metadata: Optional[dict] = None
-    tags: Optional[List[str]] = None
-    environment: Literal["production", "staging", "development", "testing"] = (
-        None
-    )
-    metric_collection: Optional[str] = None
-    confident_prompt: Optional[Prompt] = None
-    llm_metric_collection: Optional[str] = None
-    agent_metric_collection: Optional[str] = None
-    tool_metric_collection_map: dict = {}
-    trace_metric_collection: Optional[str] = None
+                def create_agent_span_for_evaluation(
+                    span: ReadableSpan,
+                ) -> AgentSpan:

-    def __init__(
-        self,
-        api_key: Optional[str] = None,
-        name: Optional[str] = None,
-        thread_id: Optional[str] = None,
-        user_id: Optional[str] = None,
-        metadata: Optional[dict] = None,
-        tags: Optional[List[str]] = None,
-        metric_collection: Optional[str] = None,
-        confident_prompt: Optional[Prompt] = None,
-        llm_metric_collection: Optional[str] = None,
-        agent_metric_collection: Optional[str] = None,
-        tool_metric_collection_map: Optional[dict] = None,
-        trace_metric_collection: Optional[str] = None,
-        is_test_mode: Optional[bool] = False,
-    ):
-        is_dependency_installed()
+                    agent_span = (
+                        ConfidentSpanExporter.prepare_boilerplate_base_span(
+                            span
+                        )
+                    )

-        _environment = os.getenv("CONFIDENT_TRACE_ENVIRONMENT", "development")
-        if _environment and _environment in [
-            "production",
-            "staging",
-            "development",
-            "testing",
-        ]:
-            self.environment = _environment
+                    # tools called
+                    normalized_messages = normalize_pydantic_ai_messages(span)
+                    tools_called = []

-        self.tool_metric_collection_map = tool_metric_collection_map or {}
-        self.name = name
-        self.thread_id = thread_id
-        self.user_id = user_id
-        self.metadata = metadata
-        self.tags = tags
-        self.metric_collection = metric_collection
-        self.confident_prompt = confident_prompt
-        self.llm_metric_collection = llm_metric_collection
-        self.agent_metric_collection = agent_metric_collection
-        self.trace_metric_collection = trace_metric_collection
+                    for message in normalized_messages:
+                        for part in message.get("parts", []):
+                            if part.get("type") == "tool_call":
+                                name = part.get("name")
+                                try:
+                                    input_parameters = json.loads(
+                                        part.get("arguments")
+                                    )
+                                except Exception:
+                                    input_parameters = {}

-        if not api_key:
-            api_key = get_confident_api_key()
-        if not api_key:
-            raise ValueError("CONFIDENT_API_KEY is not set")
+                                tools_called.append(
+                                    ToolCall(
+                                        name=name,
+                                        input_parameters=input_parameters,
+                                    )
+                                )

-        trace_provider = TracerProvider()
+                    # agent_span.tools_called = tools_called
+                    return agent_span

-        # Pass the entire settings instance instead of individual values
-        span_interceptor = SpanInterceptor(self)
-        trace_provider.add_span_processor(span_interceptor)
+                agent_span = create_agent_span_for_evaluation(span)
+                agent_span.metrics = self.settings.agent_metrics

-        if is_test_mode:
-            trace_provider.add_span_processor(BatchSpanProcessor(test_exporter))
-        else:
-            trace_provider.add_span_processor(
-                BatchSpanProcessor(
-                    OTLPSpanExporter(
-                        endpoint=OTLP_ENDPOINT,
-                        headers={"x-confident-api-key": api_key},
+                # create a trace for evaluation
+                trace = trace_manager.get_trace_by_uuid(agent_span.trace_uuid)
+                if not trace:
+                    trace = trace_manager.start_new_trace(
+                        trace_uuid=agent_span.trace_uuid
                     )
-                )
-            )
-        super().__init__(tracer_provider=trace_provider)
+
+                trace.root_spans.append(agent_span)
+                trace.status = TraceSpanStatus.SUCCESS
+                trace.end_time = perf_counter()
+                trace_manager.traces_to_evaluate.append(trace)
+                test_exporter.clear_span_json_list()
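
The rewritten ConfidentInstrumentationSettings above now builds its TracerProvider inside __init__ and accepts agent_metrics. A minimal usage sketch follows; it assumes a CONFIDENT_API_KEY is configured and that pydantic_ai's Agent accepts an InstrumentationSettings instance through its instrument parameter (pydantic_ai behavior not shown in this diff), so treat it as illustrative rather than the integration's documented API.

from pydantic_ai import Agent

from deepeval.integrations.pydantic_ai.instrumentator import (
    ConfidentInstrumentationSettings,
)
from deepeval.metrics import AnswerRelevancyMetric

# The settings object constructs its own TracerProvider and SpanInterceptor (see diff above).
settings = ConfidentInstrumentationSettings(
    name="support-agent-trace",
    thread_id="thread-123",
    agent_metrics=[AnswerRelevancyMetric()],  # new constructor argument in 3.6.8 per this diff
)

# Assumption: pydantic_ai wires InstrumentationSettings in via `instrument=`.
agent = Agent("openai:gpt-4o-mini", instrument=settings)
result = agent.run_sync("What does DeepEval do?")
print(result)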
deepeval/metrics/__init__.py
@@ -27,6 +27,12 @@ from .tool_correctness.tool_correctness import ToolCorrectnessMetric
 from .json_correctness.json_correctness import JsonCorrectnessMetric
 from .prompt_alignment.prompt_alignment import PromptAlignmentMetric
 from .task_completion.task_completion import TaskCompletionMetric
+from .topic_adherence.topic_adherence import TopicAdherenceMetric
+from .step_efficiency.step_efficiency import StepEfficiencyMetric
+from .plan_adherence.plan_adherence import PlanAdherenceMetric
+from .plan_quality.plan_quality import PlanQualityMetric
+from .tool_use.tool_use import ToolUseMetric
+from .goal_accuracy.goal_accuracy import GoalAccuracyMetric
 from .argument_correctness.argument_correctness import ArgumentCorrectnessMetric
 from .mcp.mcp_task_completion import MCPTaskCompletionMetric
 from .mcp.multi_turn_mcp_use_metric import MultiTurnMCPUseMetric
@@ -98,6 +104,13 @@ __all__ = [
     "TaskCompletionMetric",
     "ArgumentCorrectnessMetric",
     "KnowledgeRetentionMetric",
+    # Agentic metrics
+    "TopicAdherenceMetric",
+    "StepEfficiencyMetric",
+    "PlanAdherenceMetric",
+    "PlanQualityMetric",
+    "ToolUseMetric",
+    "GoalAccuracyMetric",
     # Conversational metrics
     "TurnRelevancyMetric",
     "ConversationCompletenessMetric",
deepeval/metrics/answer_relevancy/answer_relevancy.py
@@ -16,6 +16,7 @@ from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.answer_relevancy.template import AnswerRelevancyTemplate
 from deepeval.metrics.indicator import metric_progress_indicator
 from deepeval.metrics.answer_relevancy.schema import *
+from deepeval.metrics.api import metric_data_manager


 class AnswerRelevancyMetric(BaseMetric):
@@ -50,8 +51,8 @@ class AnswerRelevancyMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
-
         check_llm_test_case_params(test_case, self._required_params, self)

         self.evaluation_cost = 0 if self.using_native_model else None
@@ -65,6 +66,7 @@ class AnswerRelevancyMetric(BaseMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -85,6 +87,10 @@ class AnswerRelevancyMetric(BaseMetric):
                         f"Score: {self.score}\nReason: {self.reason}",
                     ],
                 )
+                if _log_metric_to_confident:
+                    metric_data_manager.post_metric_if_enabled(
+                        self, test_case=test_case
+                    )

             return self.score

@@ -93,8 +99,8 @@ class AnswerRelevancyMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
-
         check_llm_test_case_params(test_case, self._required_params, self)

         self.evaluation_cost = 0 if self.using_native_model else None
@@ -121,7 +127,10 @@ class AnswerRelevancyMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
-
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score

     async def _a_generate_reason(self, input: str) -> str:
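
Both measure() and a_measure() now take a _log_metric_to_confident flag (default True) and hand the finished metric to metric_data_manager. A minimal sketch of opting out per call; it assumes an evaluation model is configured (e.g. OPENAI_API_KEY for the default judge).

from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase

test_case = LLMTestCase(
    input="What is the capital of France?",
    actual_output="Paris is the capital of France.",
)

metric = AnswerRelevancyMetric(threshold=0.7)
# Skips the background post to Confident AI that measure() now performs
# when metric logging is enabled.
score = metric.measure(test_case, _log_metric_to_confident=False)
print(score, metric.reason)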
deepeval/metrics/api.py (new file)
@@ -0,0 +1,281 @@
+from typing import Optional, Set, Any, Dict, List, Union
+import threading
+import asyncio
+import queue
+import atexit
+from time import perf_counter
+from enum import Enum
+from pydantic import Field
+from rich.console import Console
+
+from deepeval.confident.api import Api, HttpMethods, Endpoints, is_confident
+from deepeval.constants import (
+    CONFIDENT_METRIC_LOGGING_FLUSH,
+    CONFIDENT_METRIC_LOGGING_VERBOSE,
+)
+from deepeval.metrics.base_metric import BaseConversationalMetric, BaseMetric
+from deepeval.test_case.conversational_test_case import ConversationalTestCase
+from deepeval.test_case.llm_test_case import LLMTestCase
+from deepeval.test_case.api import create_api_test_case
+from deepeval.test_run.api import LLMApiTestCase, ConversationalApiTestCase
+from deepeval.tracing.api import MetricData
+from deepeval.config.settings import get_settings
+
+
+class MetricWorkerStatus(Enum):
+    SUCCESS = "success"
+    FAILURE = "failure"
+    WARNING = "warning"
+
+
+class ApiMetricData(MetricData):
+    llm_test_case: Optional[LLMApiTestCase] = Field(None, alias="llmTestCase")
+    conversational_test_case: Optional[ConversationalApiTestCase] = Field(
+        None, alias="conversationalTestCase"
+    )
+
+
+class MetricDataManager:
+    """Manager for posting metric data asynchronously in background thread."""
+
+    def __init__(self):
+        settings = get_settings()
+        # Initialize queue and worker thread for metric posting
+        self._metric_queue = queue.Queue()
+        self._worker_thread = None
+        self._min_interval = 0.2  # Minimum time between API calls (seconds)
+        self._last_post_time = 0
+        self._in_flight_tasks: Set[asyncio.Task[Any]] = set()
+        self._flush_enabled = bool(settings.CONFIDENT_METRIC_LOGGING_FLUSH)
+        self._daemon = not self._flush_enabled
+        self._thread_lock = threading.Lock()
+        self.metric_logging_enabled = bool(
+            settings.CONFIDENT_METRIC_LOGGING_ENABLED
+        )
+
+        # Register an exit handler to warn about unprocessed metrics
+        atexit.register(self._warn_on_exit)
+
+    def post_metric_if_enabled(
+        self,
+        metric: Union[BaseMetric, BaseConversationalMetric],
+        test_case: Optional[Union[LLMTestCase, ConversationalTestCase]] = None,
+    ):
+        """Post metric data asynchronously in a background thread."""
+        if not self.metric_logging_enabled or not is_confident():
+            return
+
+        from deepeval.evaluate.utils import create_metric_data
+
+        metric_data = create_metric_data(metric)
+        api_metric_data = ApiMetricData(
+            **metric_data.model_dump(by_alias=True, exclude_none=True)
+        )
+
+        if isinstance(test_case, LLMTestCase):
+            api_metric_data.llm_test_case = create_api_test_case(test_case)
+        elif isinstance(test_case, ConversationalTestCase):
+            api_metric_data.conversational_test_case = create_api_test_case(
+                test_case
+            )
+
+        self._ensure_worker_thread_running()
+        self._metric_queue.put(api_metric_data)
+
+    def _warn_on_exit(self):
+        """Warn if there are unprocessed metrics on exit."""
+        queue_size = self._metric_queue.qsize()
+        in_flight = len(self._in_flight_tasks)
+        remaining_tasks = queue_size + in_flight
+
+        if not self._flush_enabled and remaining_tasks > 0:
+            self._print_metric_data_status(
+                metric_worker_status=MetricWorkerStatus.WARNING,
+                message=f"Exiting with {queue_size + in_flight} abandoned metric(s).",
+                description=f"Set {CONFIDENT_METRIC_LOGGING_FLUSH}=1 as an environment variable to flush remaining metrics to Confident AI.",
+            )
+
+    def _ensure_worker_thread_running(self):
+        """Ensure the background worker thread is running."""
+        with self._thread_lock:
+            if (
+                self._worker_thread is None
+                or not self._worker_thread.is_alive()
+            ):
+                self._worker_thread = threading.Thread(
+                    target=self._process_metric_queue,
+                    daemon=self._daemon,
+                )
+                self._worker_thread.start()
+
+    def _print_metric_data_status(
+        self,
+        metric_worker_status: MetricWorkerStatus,
+        message: str,
+        description: Optional[str] = None,
+    ):
+        """Print metric data worker status messages."""
+        if getattr(get_settings(), CONFIDENT_METRIC_LOGGING_VERBOSE, False):
+            console = Console()
+            message_prefix = "[dim][Confident AI Metric Data Log][/dim]"
+            if metric_worker_status == MetricWorkerStatus.SUCCESS:
+                message = f"[green]{message}[/green]"
+            elif metric_worker_status == MetricWorkerStatus.FAILURE:
+                message = f"[red]{message}[/red]"
+            elif metric_worker_status == MetricWorkerStatus.WARNING:
+                message = f"[yellow]{message}[/yellow]"
+
+            if bool(CONFIDENT_METRIC_LOGGING_VERBOSE):
+                if description:
+                    message += f": {description}"
+
+            console.print(
+                message_prefix,
+                message,
+                f"\nTo disable dev logging, set {CONFIDENT_METRIC_LOGGING_VERBOSE}=0 as an environment variable.",
+            )
+
+    def _process_metric_queue(self):
+        """Worker thread function that processes the metric queue."""
+        import threading
+
+        main_thr = threading.main_thread()
+
+        # Create a new event loop
+        loop = asyncio.new_event_loop()
+        asyncio.set_event_loop(loop)
+
+        # Buffer for payloads that need to be sent after main exits
+        remaining_metric_request_bodies: List[Dict[str, Any]] = []
+
+        async def _a_send_metric(metric_data: ApiMetricData):
+            nonlocal remaining_metric_request_bodies
+            try:
+                # Build API object & payload
+                try:
+                    body = metric_data.model_dump(
+                        by_alias=True,
+                        exclude_none=True,
+                    )
+                except AttributeError:
+                    # Pydantic version below 2.0
+                    body = metric_data.dict(by_alias=True, exclude_none=True)
+
+                # If the main thread is still alive, send now
+                if main_thr.is_alive():
+                    api = Api()
+                    _, _ = await api.a_send_request(
+                        method=HttpMethods.POST,
+                        endpoint=Endpoints.METRIC_DATA_ENDPOINT,
+                        body=body,
+                    )
+                    queue_size = self._metric_queue.qsize()
+                    in_flight = len(self._in_flight_tasks)
+                    status = f"({queue_size} metric{'s' if queue_size!=1 else ''} remaining in queue, {in_flight} in flight)"
+                    self._print_metric_data_status(
+                        metric_worker_status=MetricWorkerStatus.SUCCESS,
+                        message=f"Successfully posted metric data {status}",
+                    )
+                elif self._flush_enabled:
+                    # Main thread gone → to be flushed
+                    remaining_metric_request_bodies.append(body)
+
+            except Exception as e:
+                queue_size = self._metric_queue.qsize()
+                in_flight = len(self._in_flight_tasks)
+                status = f"({queue_size} metric{'s' if queue_size!=1 else ''} remaining in queue, {in_flight} in flight)"
+                self._print_metric_data_status(
+                    metric_worker_status=MetricWorkerStatus.FAILURE,
+                    message=f"Error posting metric data {status}",
+                    description=str(e),
+                )
+            finally:
+                task = asyncio.current_task()
+                if task:
+                    self._in_flight_tasks.discard(task)
+
+        async def async_worker():
+            # Continue while user code is running or work remains
+            while (
+                main_thr.is_alive()
+                or not self._metric_queue.empty()
+                or self._in_flight_tasks
+            ):
+                try:
+                    metric_data = self._metric_queue.get(
+                        block=True, timeout=1.0
+                    )
+
+                    # Rate-limit
+                    now = perf_counter()
+                    elapsed = now - self._last_post_time
+                    if elapsed < self._min_interval:
+                        await asyncio.sleep(self._min_interval - elapsed)
+                    self._last_post_time = perf_counter()
+
+                    # Schedule async send
+                    task = asyncio.create_task(_a_send_metric(metric_data))
+                    self._in_flight_tasks.add(task)
+                    self._metric_queue.task_done()
+
+                except queue.Empty:
+                    await asyncio.sleep(0.1)
+                    continue
+                except Exception as e:
+                    self._print_metric_data_status(
+                        message="Error in metric worker",
+                        metric_worker_status=MetricWorkerStatus.FAILURE,
+                        description=str(e),
+                    )
+                    await asyncio.sleep(1.0)
+
+        try:
+            loop.run_until_complete(async_worker())
+        finally:
+            # Drain any pending tasks
+            pending = asyncio.all_tasks(loop=loop)
+            if pending:
+                loop.run_until_complete(
+                    asyncio.gather(*pending, return_exceptions=True)
+                )
+            self._flush_metrics(remaining_metric_request_bodies)
+            loop.run_until_complete(loop.shutdown_asyncgens())
+            loop.close()
+
+    def _flush_metrics(
+        self, remaining_metric_request_bodies: List[Dict[str, Any]]
+    ):
+        """Flush remaining metrics synchronously."""
+        if not remaining_metric_request_bodies:
+            return
+
+        self._print_metric_data_status(
+            MetricWorkerStatus.WARNING,
+            message=f"Flushing {len(remaining_metric_request_bodies)} remaining metric(s)",
+        )
+
+        for body in remaining_metric_request_bodies:
+            try:
+                api = Api()
+                _, link = api.send_request(
+                    method=HttpMethods.POST,
+                    endpoint=Endpoints.METRIC_DATA_ENDPOINT,
+                    body=body,
+                )
+                qs = self._metric_queue.qsize()
+                self._print_metric_data_status(
+                    metric_worker_status=MetricWorkerStatus.SUCCESS,
+                    message=f"Successfully posted metric data ({qs} metrics remaining in queue, 1 in flight)",
+                    description=link,
+                )
+            except Exception as e:
+                qs = self._metric_queue.qsize()
+                self._print_metric_data_status(
+                    metric_worker_status=MetricWorkerStatus.FAILURE,
+                    message="Error flushing remaining metric(s)",
+                    description=str(e),
+                )
+
+
+# Global metric manager instance
+metric_data_manager = MetricDataManager()
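
The new MetricDataManager is driven by three settings whose constants are imported above. A minimal sketch of enabling it via environment variables; it assumes the environment-variable names match the constant names and that they are set before deepeval.config.settings is first read (neither is guaranteed by this diff alone).

import os

# Hypothetical env-var wiring, mirroring the constant names used in api.py.
os.environ["CONFIDENT_METRIC_LOGGING_ENABLED"] = "1"  # queue metric posts
os.environ["CONFIDENT_METRIC_LOGGING_VERBOSE"] = "1"  # print worker status
os.environ["CONFIDENT_METRIC_LOGGING_FLUSH"] = "1"    # flush the queue at exit

from deepeval.metrics.api import metric_data_manager  # module-level singleton

print(metric_data_manager.metric_logging_enabled)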
deepeval/metrics/argument_correctness/argument_correctness.py
@@ -19,6 +19,7 @@ from deepeval.metrics.argument_correctness.template import (
 )
 from deepeval.metrics.indicator import metric_progress_indicator
 from deepeval.metrics.argument_correctness.schema import *
+from deepeval.metrics.api import metric_data_manager


 class ArgumentCorrectnessMetric(BaseMetric):
@@ -53,6 +54,7 @@ class ArgumentCorrectnessMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:

         check_llm_test_case_params(test_case, self._required_params, self)
@@ -68,6 +70,7 @@ class ArgumentCorrectnessMetric(BaseMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -91,7 +94,10 @@ class ArgumentCorrectnessMetric(BaseMetric):
                         f"Score: {self.score}\nReason: {self.reason}",
                     ],
                 )
-
+                if _log_metric_to_confident:
+                    metric_data_manager.post_metric_if_enabled(
+                        self, test_case=test_case
+                    )
             return self.score

     async def a_measure(
@@ -99,6 +105,7 @@ class ArgumentCorrectnessMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:

         check_llm_test_case_params(test_case, self._required_params, self)
@@ -130,7 +137,10 @@ class ArgumentCorrectnessMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
-
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score

     async def _a_generate_reason(self, input: str) -> str:
deepeval/metrics/base_metric.py
@@ -27,6 +27,7 @@ class BaseMetric:
     evaluation_cost: Optional[float] = None
     verbose_logs: Optional[str] = None
     skipped = False
+    requires_trace: bool = False
     model = Optional[DeepEvalBaseLLM]
     using_native_model = Optional[bool]
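
BaseMetric gains a requires_trace class attribute (default False). A minimal sketch of a custom metric opting in; what the flag triggers downstream (e.g. in evaluate/execute.py) is not shown here, so the interpretation in the comment is an assumption.

from deepeval.metrics import BaseMetric
from deepeval.test_case import LLMTestCase


class MyTraceAwareMetric(BaseMetric):
    # Assumption: signals that this metric needs a full trace, not just a bare LLMTestCase.
    requires_trace = True

    def __init__(self, threshold: float = 0.5):
        self.threshold = threshold

    def measure(self, test_case: LLMTestCase) -> float:
        self.score = 1.0  # placeholder scoring logic
        self.success = self.score >= self.threshold
        return self.score

    async def a_measure(self, test_case: LLMTestCase) -> float:
        return self.measure(test_case)

    def is_successful(self) -> bool:
        return self.success

    @property
    def __name__(self):
        return "My Trace-Aware Metric"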