deepeval 3.6.7__py3-none-any.whl → 3.6.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90)
  1. deepeval/_version.py +1 -1
  2. deepeval/config/settings.py +104 -36
  3. deepeval/config/utils.py +5 -0
  4. deepeval/dataset/dataset.py +162 -30
  5. deepeval/dataset/utils.py +41 -13
  6. deepeval/errors.py +20 -2
  7. deepeval/evaluate/execute.py +1662 -688
  8. deepeval/evaluate/types.py +1 -0
  9. deepeval/evaluate/utils.py +13 -3
  10. deepeval/integrations/crewai/__init__.py +2 -1
  11. deepeval/integrations/crewai/tool.py +71 -0
  12. deepeval/integrations/llama_index/__init__.py +0 -4
  13. deepeval/integrations/llama_index/handler.py +20 -21
  14. deepeval/integrations/pydantic_ai/instrumentator.py +125 -76
  15. deepeval/metrics/__init__.py +13 -0
  16. deepeval/metrics/base_metric.py +1 -0
  17. deepeval/metrics/contextual_precision/contextual_precision.py +27 -21
  18. deepeval/metrics/conversational_g_eval/__init__.py +3 -0
  19. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +11 -7
  20. deepeval/metrics/dag/schema.py +1 -1
  21. deepeval/metrics/dag/templates.py +2 -2
  22. deepeval/metrics/goal_accuracy/__init__.py +1 -0
  23. deepeval/metrics/goal_accuracy/goal_accuracy.py +349 -0
  24. deepeval/metrics/goal_accuracy/schema.py +17 -0
  25. deepeval/metrics/goal_accuracy/template.py +235 -0
  26. deepeval/metrics/hallucination/hallucination.py +8 -8
  27. deepeval/metrics/indicator.py +21 -1
  28. deepeval/metrics/mcp/mcp_task_completion.py +7 -2
  29. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +16 -6
  30. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +2 -1
  31. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +32 -24
  32. deepeval/metrics/plan_adherence/__init__.py +1 -0
  33. deepeval/metrics/plan_adherence/plan_adherence.py +292 -0
  34. deepeval/metrics/plan_adherence/schema.py +11 -0
  35. deepeval/metrics/plan_adherence/template.py +170 -0
  36. deepeval/metrics/plan_quality/__init__.py +1 -0
  37. deepeval/metrics/plan_quality/plan_quality.py +292 -0
  38. deepeval/metrics/plan_quality/schema.py +11 -0
  39. deepeval/metrics/plan_quality/template.py +101 -0
  40. deepeval/metrics/step_efficiency/__init__.py +1 -0
  41. deepeval/metrics/step_efficiency/schema.py +11 -0
  42. deepeval/metrics/step_efficiency/step_efficiency.py +234 -0
  43. deepeval/metrics/step_efficiency/template.py +256 -0
  44. deepeval/metrics/task_completion/task_completion.py +1 -0
  45. deepeval/metrics/tool_correctness/schema.py +6 -0
  46. deepeval/metrics/tool_correctness/template.py +88 -0
  47. deepeval/metrics/tool_correctness/tool_correctness.py +226 -22
  48. deepeval/metrics/tool_use/__init__.py +1 -0
  49. deepeval/metrics/tool_use/schema.py +19 -0
  50. deepeval/metrics/tool_use/template.py +220 -0
  51. deepeval/metrics/tool_use/tool_use.py +458 -0
  52. deepeval/metrics/topic_adherence/__init__.py +1 -0
  53. deepeval/metrics/topic_adherence/schema.py +16 -0
  54. deepeval/metrics/topic_adherence/template.py +162 -0
  55. deepeval/metrics/topic_adherence/topic_adherence.py +355 -0
  56. deepeval/models/embedding_models/azure_embedding_model.py +37 -36
  57. deepeval/models/embedding_models/local_embedding_model.py +30 -32
  58. deepeval/models/embedding_models/ollama_embedding_model.py +18 -20
  59. deepeval/models/embedding_models/openai_embedding_model.py +22 -31
  60. deepeval/models/llms/amazon_bedrock_model.py +20 -17
  61. deepeval/models/llms/openai_model.py +10 -1
  62. deepeval/models/retry_policy.py +103 -20
  63. deepeval/openai/extractors.py +61 -16
  64. deepeval/openai/patch.py +8 -12
  65. deepeval/openai/types.py +1 -1
  66. deepeval/openai/utils.py +108 -1
  67. deepeval/prompt/prompt.py +1 -0
  68. deepeval/prompt/utils.py +43 -14
  69. deepeval/simulator/conversation_simulator.py +25 -18
  70. deepeval/synthesizer/chunking/context_generator.py +9 -1
  71. deepeval/synthesizer/synthesizer.py +11 -10
  72. deepeval/test_case/llm_test_case.py +6 -2
  73. deepeval/test_run/test_run.py +190 -207
  74. deepeval/tracing/__init__.py +2 -1
  75. deepeval/tracing/otel/exporter.py +3 -4
  76. deepeval/tracing/otel/utils.py +23 -4
  77. deepeval/tracing/trace_context.py +53 -38
  78. deepeval/tracing/tracing.py +23 -0
  79. deepeval/tracing/types.py +16 -14
  80. deepeval/utils.py +21 -0
  81. {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/METADATA +1 -1
  82. {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/RECORD +85 -63
  83. deepeval/integrations/llama_index/agent/patched.py +0 -68
  84. deepeval/tracing/message_types/__init__.py +0 -10
  85. deepeval/tracing/message_types/base.py +0 -6
  86. deepeval/tracing/message_types/messages.py +0 -14
  87. deepeval/tracing/message_types/tools.py +0 -18
  88. {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/LICENSE.md +0 -0
  89. {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/WHEEL +0 -0
  90. {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/entry_points.txt +0 -0
@@ -10,6 +10,7 @@ from deepeval.test_case import MLLMImage
 class TestResult:
     """Returned from run_test"""
 
+    __test__ = False
     name: str
     success: bool
     metrics_data: Union[List[MetricData], None]
@@ -5,8 +5,6 @@ import os
 import time
 
 from deepeval.utils import format_turn
-from deepeval.test_case.conversational_test_case import Turn
-from deepeval.test_run.api import TurnApi
 from deepeval.test_run.test_run import TestRunResultDisplay
 from deepeval.dataset import Golden
 from deepeval.metrics import (
@@ -481,6 +479,18 @@ def count_metrics_in_trace(trace: Trace) -> int:
     return sum(count_metrics_recursive(span) for span in trace.root_spans)
 
 
+def count_total_metrics_for_trace(trace: Trace) -> int:
+    """Span subtree metrics + trace-level metrics."""
+    return count_metrics_in_trace(trace=trace) + len(trace.metrics or [])
+
+
+def count_metrics_in_span_subtree(span: BaseSpan) -> int:
+    total = len(span.metrics or [])
+    for c in span.children or []:
+        total += count_metrics_in_span_subtree(c)
+    return total
+
+
 def extract_trace_test_results(trace_api: TraceApi) -> List[TestResult]:
     test_results: List[TestResult] = []
     # extract trace result
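The two new helpers above simply add up the metrics lists over a span subtree and a whole trace. Below is a self-contained sketch of the same traversal using a hypothetical FakeSpan stand-in (real BaseSpan objects carry many more fields); it is an illustration of the recursion, not deepeval code.

from dataclasses import dataclass, field
from typing import List, Optional

@dataclass
class FakeSpan:  # hypothetical stand-in for BaseSpan, reduced to what the helper reads
    metrics: Optional[list] = None
    children: List["FakeSpan"] = field(default_factory=list)

def count_metrics_in_span_subtree(span: FakeSpan) -> int:
    # same recursion as the helper added in the hunk above
    total = len(span.metrics or [])
    for c in span.children or []:
        total += count_metrics_in_span_subtree(c)
    return total

root = FakeSpan(metrics=["m1"], children=[FakeSpan(metrics=["m2", "m3"]), FakeSpan()])
assert count_metrics_in_span_subtree(root) == 3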
@@ -523,7 +533,7 @@ def extract_span_test_results(span_api: BaseApiSpan) -> List[TestResult]:
         test_results.append(
             TestResult(
                 name=span_api.name,
-                success=span_api.status == "SUCCESS",
+                success=span_api.status == TraceSpanApiStatus.SUCCESS,
                 metrics_data=span_api.metrics_data,
                 input=span_api.input,
                 actual_output=span_api.output,
@@ -4,5 +4,6 @@ from .subs import (
     DeepEvalAgent as Agent,
     DeepEvalLLM as LLM,
 )
+from .tool import tool
 
-__all__ = ["instrument_crewai", "Crew", "Agent", "LLM"]
+__all__ = ["instrument_crewai", "Crew", "Agent", "LLM", "tool"]
@@ -0,0 +1,71 @@
+import functools
+from typing import Callable
+from crewai.tools import tool as crewai_tool
+
+from deepeval.tracing.context import current_span_context
+from deepeval.tracing.types import ToolSpan
+
+
+def tool(*args, metric=None, metric_collection=None, **kwargs) -> Callable:
+    """
+    Simple wrapper around crewai.tools.tool that:
+    - prints the original function's input and output
+    - accepts additional parameters: metric and metric_collection (unused, for compatibility)
+    - remains backward compatible with CrewAI's decorator usage patterns
+    """
+    crewai_kwargs = kwargs
+
+    # Case 1: @tool (function passed directly)
+    if len(args) == 1 and callable(args[0]):
+        f = args[0]
+        tool_name = f.__name__
+
+        @functools.wraps(f)
+        def wrapped(*f_args, **f_kwargs):
+            current_span = current_span_context.get()
+            if current_span and isinstance(current_span, ToolSpan):
+                current_span.metric_collection = metric_collection
+                current_span.metrics = metric
+            result = f(*f_args, **f_kwargs)
+            return result
+
+        return crewai_tool(tool_name, **crewai_kwargs)(wrapped)
+
+    # Case 2: @tool("name")
+    if len(args) == 1 and isinstance(args[0], str):
+        tool_name = args[0]
+
+        def _decorator(f: Callable) -> Callable:
+            @functools.wraps(f)
+            def wrapped(*f_args, **f_kwargs):
+                current_span = current_span_context.get()
+                if current_span and isinstance(current_span, ToolSpan):
+                    current_span.metric_collection = metric_collection
+                    current_span.metrics = metric
+                result = f(*f_args, **f_kwargs)
+                return result
+
+            return crewai_tool(tool_name, **crewai_kwargs)(wrapped)
+
+        return _decorator
+
+    # Case 3: @tool(result_as_answer=True, ...) — kwargs only
+    if len(args) == 0:
+
+        def _decorator(f: Callable) -> Callable:
+            tool_name = f.__name__
+
+            @functools.wraps(f)
+            def wrapped(*f_args, **f_kwargs):
+                current_span = current_span_context.get()
+                if current_span and isinstance(current_span, ToolSpan):
+                    current_span.metric_collection = metric_collection
+                    current_span.metrics = metric
+                result = f(*f_args, **f_kwargs)
+                return result
+
+            return crewai_tool(tool_name, **crewai_kwargs)(wrapped)
+
+        return _decorator
+
+    raise ValueError("Invalid arguments")
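Since the new tool decorator forwards to crewai.tools.tool while tagging the active ToolSpan, typical usage looks like the sketch below. This is a minimal sketch, assuming instrument_crewai picks up the Confident API key from the environment; the tool name and metric collection string are placeholders, not values from the diff.

from deepeval.integrations.crewai import instrument_crewai, tool

instrument_crewai()  # assumed to read CONFIDENT_API_KEY from the environment

# "web_search" and "my-tool-metrics" are illustrative placeholders
@tool("web_search", metric_collection="my-tool-metrics")
def web_search(query: str) -> str:
    """Search the web and return raw results."""
    return f"results for {query}"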
@@ -1,10 +1,6 @@
 from .handler import instrument_llama_index
-from .agent.patched import FunctionAgent, ReActAgent, CodeActAgent
 
 
 __all__ = [
     "instrument_llama_index",
-    "FunctionAgent",
-    "ReActAgent",
-    "CodeActAgent",
 ]
@@ -5,6 +5,10 @@ import uuid
 from deepeval.telemetry import capture_tracing_integration
 from deepeval.tracing import trace_manager
 from deepeval.tracing.types import AgentSpan, BaseSpan, LlmSpan, TraceSpanStatus
+from deepeval.tracing.trace_context import (
+    current_llm_context,
+    current_agent_context,
+)
 
 try:
     from llama_index.core.instrumentation.events.base import BaseEvent
@@ -22,11 +26,6 @@ try:
         LLMChatEndEvent,
     )
     from llama_index_instrumentation.dispatcher import Dispatcher
-    from deepeval.integrations.llama_index.agent.patched import (
-        FunctionAgent as PatchedFunctionAgent,
-        ReActAgent as PatchedReActAgent,
-        CodeActAgent as PatchedCodeActAgent,
-    )
     from deepeval.integrations.llama_index.utils import (
         parse_id,
         prepare_input_llm_test_case_params,
@@ -67,6 +66,7 @@ class LLamaIndexHandler(BaseEventHandler, BaseSpanHandler):
                 ).strip()
                 input_messages.append({"role": role, "content": content})
 
+            llm_span_context = current_llm_context.get()
             # create the span
             llm_span = LlmSpan(
                 name="ConfidentLLMSpan",
@@ -83,6 +83,12 @@ class LLamaIndexHandler(BaseEventHandler, BaseSpanHandler):
                 ), # check the model name not coming in this option
                 input=input_messages,
                 output="",
+                metrics=llm_span_context.metrics if llm_span_context else None,
+                metric_collection=(
+                    llm_span_context.metric_collection
+                    if llm_span_context
+                    else None
+                ),
             )
             trace_manager.add_span(llm_span)
             trace_manager.add_span_to_trace(llm_span)
@@ -144,6 +150,7 @@ class LLamaIndexHandler(BaseEventHandler, BaseSpanHandler):
 
         # conditions to qualify as agent start run span
        if method_name == "run":
+            agent_span_context = current_agent_context.get()
             span = AgentSpan(
                 uuid=id_,
                 status=TraceSpanStatus.IN_PROGRESS,
@@ -153,24 +160,16 @@ class LLamaIndexHandler(BaseEventHandler, BaseSpanHandler):
                 start_time=perf_counter(),
                 name="Agent", # TODO: decide the name of the span
                 input=bound_args.arguments,
+                metrics=(
+                    agent_span_context.metrics if agent_span_context else None
+                ),
+                metric_collection=(
+                    agent_span_context.metric_collection
+                    if agent_span_context
+                    else None
+                ),
             )
 
-            # check if the instance is a PatchedFunctionAgent
-            if isinstance(instance, PatchedFunctionAgent):
-                span.name = "FunctionAgent"
-                span.metric_collection = instance.metric_collection
-                span.metrics = instance.metrics
-
-            if isinstance(instance, PatchedReActAgent):
-                span.name = "ReActAgent"
-                span.metric_collection = instance.metric_collection
-                span.metrics = instance.metrics
-
-            if isinstance(instance, PatchedCodeActAgent):
-                span.name = "CodeActAgent"
-                span.metric_collection = instance.metric_collection
-                span.metrics = instance.metrics
-
             # prepare input test case params for the span
             prepare_input_llm_test_case_params(
                 class_name, method_name, span, bound_args.arguments
@@ -1,14 +1,19 @@
 import json
 import logging
 import os
+from time import perf_counter
 from typing import Literal, Optional, List
 
 from deepeval.config.settings import get_settings
 from deepeval.confident.api import get_confident_api_key
+from deepeval.metrics.base_metric import BaseMetric
 from deepeval.prompt import Prompt
 from deepeval.tracing.context import current_trace_context
 from deepeval.tracing.types import Trace
 from deepeval.tracing.otel.utils import to_hex_string
+from deepeval.tracing.tracing import trace_manager
+from deepeval.tracing.otel.utils import normalize_pydantic_ai_messages
+from deepeval.tracing.otel.exporter import ConfidentSpanExporter
 
 
 logger = logging.getLogger(__name__)
@@ -21,6 +26,7 @@ try:
     from opentelemetry.exporter.otlp.proto.http.trace_exporter import (
         OTLPSpanExporter,
     )
+    from opentelemetry.sdk.trace import ReadableSpan
 
     dependency_installed = True
 except ImportError as e:
@@ -48,24 +54,96 @@ def is_dependency_installed():
     return True
 
 
+from deepeval.tracing.types import AgentSpan
 from deepeval.confident.api import get_confident_api_key
 from deepeval.prompt import Prompt
 from deepeval.tracing.otel.test_exporter import test_exporter
 from deepeval.tracing.context import current_trace_context
 from deepeval.tracing.types import Trace
 from deepeval.tracing.otel.utils import to_hex_string
+from deepeval.tracing.types import TraceSpanStatus, ToolCall
+from deepeval.tracing.perf_epoch_bridge import init_clock_bridge
 
 # OTLP_ENDPOINT = "http://127.0.0.1:4318/v1/traces"
 OTLP_ENDPOINT = "https://otel.confident-ai.com/v1/traces"
+init_clock_bridge() # initialize clock bridge for perf_counter() to epoch_nanos conversion
+
+
+class ConfidentInstrumentationSettings(InstrumentationSettings):
+
+    def __init__(
+        self,
+        api_key: Optional[str] = None,
+        name: Optional[str] = None,
+        thread_id: Optional[str] = None,
+        user_id: Optional[str] = None,
+        metadata: Optional[dict] = None,
+        tags: Optional[List[str]] = None,
+        metric_collection: Optional[str] = None,
+        confident_prompt: Optional[Prompt] = None,
+        llm_metric_collection: Optional[str] = None,
+        agent_metric_collection: Optional[str] = None,
+        tool_metric_collection_map: Optional[dict] = None,
+        trace_metric_collection: Optional[str] = None,
+        is_test_mode: Optional[bool] = False,
+        agent_metrics: Optional[List[BaseMetric]] = None,
+    ):
+        is_dependency_installed()
+
+        _environment = os.getenv("CONFIDENT_TRACE_ENVIRONMENT", "development")
+        if _environment and _environment in [
+            "production",
+            "staging",
+            "development",
+            "testing",
+        ]:
+            self.environment = _environment
+
+        self.tool_metric_collection_map = tool_metric_collection_map or {}
+        self.name = name
+        self.thread_id = thread_id
+        self.user_id = user_id
+        self.metadata = metadata
+        self.tags = tags
+        self.metric_collection = metric_collection
+        self.confident_prompt = confident_prompt
+        self.llm_metric_collection = llm_metric_collection
+        self.agent_metric_collection = agent_metric_collection
+        self.trace_metric_collection = trace_metric_collection
+        self.is_test_mode = is_test_mode
+        self.agent_metrics = agent_metrics
+
+        if not api_key:
+            api_key = get_confident_api_key()
+        if not api_key:
+            raise ValueError("CONFIDENT_API_KEY is not set")
+
+        trace_provider = TracerProvider()
+
+        # Pass the entire settings instance instead of individual values
+        span_interceptor = SpanInterceptor(self)
+        trace_provider.add_span_processor(span_interceptor)
+
+        if is_test_mode:
+            trace_provider.add_span_processor(BatchSpanProcessor(test_exporter))
+        else:
+            trace_provider.add_span_processor(
+                BatchSpanProcessor(
+                    OTLPSpanExporter(
+                        endpoint=OTLP_ENDPOINT,
+                        headers={"x-confident-api-key": api_key},
+                    )
+                )
+            )
+        super().__init__(tracer_provider=trace_provider)
 
 
 class SpanInterceptor(SpanProcessor):
-    def __init__(self, settings_instance):
+    def __init__(self, settings_instance: ConfidentInstrumentationSettings):
         # Keep a reference to the settings instance instead of copying values
-        self.settings: ConfidentInstrumentationSettings = settings_instance
+        self.settings = settings_instance
 
     def on_start(self, span, parent_context):
-
         # set trace uuid
         _current_trace_context = current_trace_context.get()
         if _current_trace_context and isinstance(_current_trace_context, Trace):
@@ -151,85 +229,56 @@ class SpanInterceptor(SpanProcessor):
             )
 
     def on_end(self, span):
-        pass
-
+        if self.settings.is_test_mode:
+            if span.attributes.get("confident.span.type") == "agent":
 
-class ConfidentInstrumentationSettings(InstrumentationSettings):
-
-    name: Optional[str] = None
-    thread_id: Optional[str] = None
-    user_id: Optional[str] = None
-    metadata: Optional[dict] = None
-    tags: Optional[List[str]] = None
-    environment: Literal["production", "staging", "development", "testing"] = (
-        None
-    )
-    metric_collection: Optional[str] = None
-    confident_prompt: Optional[Prompt] = None
-    llm_metric_collection: Optional[str] = None
-    agent_metric_collection: Optional[str] = None
-    tool_metric_collection_map: dict = {}
-    trace_metric_collection: Optional[str] = None
+                def create_agent_span_for_evaluation(
+                    span: ReadableSpan,
+                ) -> AgentSpan:
 
-    def __init__(
-        self,
-        api_key: Optional[str] = None,
-        name: Optional[str] = None,
-        thread_id: Optional[str] = None,
-        user_id: Optional[str] = None,
-        metadata: Optional[dict] = None,
-        tags: Optional[List[str]] = None,
-        metric_collection: Optional[str] = None,
-        confident_prompt: Optional[Prompt] = None,
-        llm_metric_collection: Optional[str] = None,
-        agent_metric_collection: Optional[str] = None,
-        tool_metric_collection_map: Optional[dict] = None,
-        trace_metric_collection: Optional[str] = None,
-        is_test_mode: Optional[bool] = False,
-    ):
-        is_dependency_installed()
+                    agent_span = (
+                        ConfidentSpanExporter.prepare_boilerplate_base_span(
+                            span
+                        )
+                    )
 
-        _environment = os.getenv("CONFIDENT_TRACE_ENVIRONMENT", "development")
-        if _environment and _environment in [
-            "production",
-            "staging",
-            "development",
-            "testing",
-        ]:
-            self.environment = _environment
+                    # tools called
+                    normalized_messages = normalize_pydantic_ai_messages(span)
+                    tools_called = []
 
-        self.tool_metric_collection_map = tool_metric_collection_map or {}
-        self.name = name
-        self.thread_id = thread_id
-        self.user_id = user_id
-        self.metadata = metadata
-        self.tags = tags
-        self.metric_collection = metric_collection
-        self.confident_prompt = confident_prompt
-        self.llm_metric_collection = llm_metric_collection
-        self.agent_metric_collection = agent_metric_collection
-        self.trace_metric_collection = trace_metric_collection
+                    for message in normalized_messages:
+                        for part in message.get("parts", []):
+                            if part.get("type") == "tool_call":
+                                name = part.get("name")
+                                try:
+                                    input_parameters = json.loads(
+                                        part.get("arguments")
+                                    )
+                                except Exception:
+                                    input_parameters = {}
 
-        if not api_key:
-            api_key = get_confident_api_key()
-        if not api_key:
-            raise ValueError("CONFIDENT_API_KEY is not set")
+                                tools_called.append(
+                                    ToolCall(
+                                        name=name,
+                                        input_parameters=input_parameters,
+                                    )
+                                )
 
-        trace_provider = TracerProvider()
+                    # agent_span.tools_called = tools_called
+                    return agent_span
 
-        # Pass the entire settings instance instead of individual values
-        span_interceptor = SpanInterceptor(self)
-        trace_provider.add_span_processor(span_interceptor)
+                agent_span = create_agent_span_for_evaluation(span)
+                agent_span.metrics = self.settings.agent_metrics
 
-        if is_test_mode:
-            trace_provider.add_span_processor(BatchSpanProcessor(test_exporter))
-        else:
-            trace_provider.add_span_processor(
-                BatchSpanProcessor(
-                    OTLPSpanExporter(
-                        endpoint=OTLP_ENDPOINT,
-                        headers={"x-confident-api-key": api_key},
+                # create a trace for evaluation
+                trace = trace_manager.get_trace_by_uuid(agent_span.trace_uuid)
+                if not trace:
+                    trace = trace_manager.start_new_trace(
+                        trace_uuid=agent_span.trace_uuid
                     )
-                )
-            )
-        super().__init__(tracer_provider=trace_provider)
+
+                trace.root_spans.append(agent_span)
+                trace.status = TraceSpanStatus.SUCCESS
+                trace.end_time = perf_counter()
+                trace_manager.traces_to_evaluate.append(trace)
+                test_exporter.clear_span_json_list()
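The settings class keeps the same constructor after the move, so wiring it into an agent is unchanged. Below is a hedged sketch, assuming pydantic_ai's Agent(..., instrument=...) parameter and a CONFIDENT_API_KEY in the environment; the agent name, model string, and collection names are placeholders, not values from the diff.

from pydantic_ai import Agent
from deepeval.integrations.pydantic_ai.instrumentator import (
    ConfidentInstrumentationSettings,
)

# keyword arguments mirror the __init__ signature shown above; values are placeholders
settings = ConfidentInstrumentationSettings(
    name="support-agent",
    thread_id="thread-123",
    agent_metric_collection="agent-metrics",
    tool_metric_collection_map={"get_weather": "tool-metrics"},
)

agent = Agent("openai:gpt-4o-mini", instrument=settings)  # assumed pydantic_ai API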
@@ -27,6 +27,12 @@ from .tool_correctness.tool_correctness import ToolCorrectnessMetric
 from .json_correctness.json_correctness import JsonCorrectnessMetric
 from .prompt_alignment.prompt_alignment import PromptAlignmentMetric
 from .task_completion.task_completion import TaskCompletionMetric
+from .topic_adherence.topic_adherence import TopicAdherenceMetric
+from .step_efficiency.step_efficiency import StepEfficiencyMetric
+from .plan_adherence.plan_adherence import PlanAdherenceMetric
+from .plan_quality.plan_quality import PlanQualityMetric
+from .tool_use.tool_use import ToolUseMetric
+from .goal_accuracy.goal_accuracy import GoalAccuracyMetric
 from .argument_correctness.argument_correctness import ArgumentCorrectnessMetric
 from .mcp.mcp_task_completion import MCPTaskCompletionMetric
 from .mcp.multi_turn_mcp_use_metric import MultiTurnMCPUseMetric
@@ -98,6 +104,13 @@ __all__ = [
     "TaskCompletionMetric",
     "ArgumentCorrectnessMetric",
     "KnowledgeRetentionMetric",
+    # Agentic metrics
+    "TopicAdherenceMetric",
+    "StepEfficiencyMetric",
+    "PlanAdherenceMetric",
+    "PlanQualityMetric",
+    "ToolUseMetric",
+    "GoalAccuracyMetric",
     # Conversational metrics
     "TurnRelevancyMetric",
     "ConversationCompletenessMetric",
@@ -27,6 +27,7 @@ class BaseMetric:
     evaluation_cost: Optional[float] = None
     verbose_logs: Optional[str] = None
     skipped = False
+    requires_trace: bool = False
     model = Optional[DeepEvalBaseLLM]
     using_native_model = Optional[bool]
 
@@ -17,7 +17,7 @@ from deepeval.metrics.contextual_precision.template import (
     ContextualPrecisionTemplate,
 )
 from deepeval.metrics.indicator import metric_progress_indicator
-from deepeval.metrics.contextual_precision.schema import *
+import deepeval.metrics.contextual_precision.schema as cpschema
 from deepeval.metrics.api import metric_data_manager
 
 
@@ -73,7 +73,7 @@ class ContextualPrecisionMetric(BaseMetric):
                     )
                 )
             else:
-                self.verdicts: List[ContextualPrecisionVerdict] = (
+                self.verdicts: List[cpschema.ContextualPrecisionVerdict] = (
                     self._generate_verdicts(
                         test_case.input,
                         test_case.expected_output,
@@ -113,7 +113,7 @@ class ContextualPrecisionMetric(BaseMetric):
             _show_indicator=_show_indicator,
             _in_component=_in_component,
         ):
-            self.verdicts: List[ContextualPrecisionVerdict] = (
+            self.verdicts: List[cpschema.ContextualPrecisionVerdict] = (
                 await self._a_generate_verdicts(
                     test_case.input,
                     test_case.expected_output,
@@ -141,7 +141,7 @@ class ContextualPrecisionMetric(BaseMetric):
             return None
 
         retrieval_contexts_verdicts = [
-            {"verdict": verdict.verdict, "reasons": verdict.reason}
+            {"verdict": verdict.verdict, "reason": verdict.reason}
            for verdict in self.verdicts
        ]
        prompt = self.evaluation_template.generate_reason(
@@ -152,15 +152,15 @@ class ContextualPrecisionMetric(BaseMetric):
 
         if self.using_native_model:
             res, cost = await self.model.a_generate(
-                prompt, schema=ContextualPrecisionScoreReason
+                prompt, schema=cpschema.ContextualPrecisionScoreReason
             )
             self.evaluation_cost += cost
             return res.reason
         else:
             try:
-                res: ContextualPrecisionScoreReason = (
+                res: cpschema.ContextualPrecisionScoreReason = (
                     await self.model.a_generate(
-                        prompt, schema=ContextualPrecisionScoreReason
+                        prompt, schema=cpschema.ContextualPrecisionScoreReason
                     )
                 )
                 return res.reason
@@ -174,7 +174,7 @@ class ContextualPrecisionMetric(BaseMetric):
             return None
 
         retrieval_contexts_verdicts = [
-            {"verdict": verdict.verdict, "reasons": verdict.reason}
+            {"verdict": verdict.verdict, "reason": verdict.reason}
             for verdict in self.verdicts
         ]
         prompt = self.evaluation_template.generate_reason(
@@ -185,14 +185,16 @@ class ContextualPrecisionMetric(BaseMetric):
 
         if self.using_native_model:
             res, cost = self.model.generate(
-                prompt, schema=ContextualPrecisionScoreReason
+                prompt, schema=cpschema.ContextualPrecisionScoreReason
             )
             self.evaluation_cost += cost
             return res.reason
         else:
             try:
-                res: ContextualPrecisionScoreReason = self.model.generate(
-                    prompt, schema=ContextualPrecisionScoreReason
+                res: cpschema.ContextualPrecisionScoreReason = (
+                    self.model.generate(
+                        prompt, schema=cpschema.ContextualPrecisionScoreReason
+                    )
                 )
                 return res.reason
             except TypeError:
@@ -202,21 +204,23 @@ class ContextualPrecisionMetric(BaseMetric):
 
     async def _a_generate_verdicts(
         self, input: str, expected_output: str, retrieval_context: List[str]
-    ) -> List[ContextualPrecisionVerdict]:
+    ) -> List[cpschema.ContextualPrecisionVerdict]:
         prompt = self.evaluation_template.generate_verdicts(
             input=input,
             expected_output=expected_output,
             retrieval_context=retrieval_context,
         )
         if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=Verdicts)
+            res, cost = await self.model.a_generate(
+                prompt, schema=cpschema.Verdicts
+            )
             self.evaluation_cost += cost
             verdicts = [item for item in res.verdicts]
             return verdicts
         else:
             try:
-                res: Verdicts = await self.model.a_generate(
-                    prompt, schema=Verdicts
+                res: cpschema.Verdicts = await self.model.a_generate(
+                    prompt, schema=cpschema.Verdicts
                 )
                 verdicts = [item for item in res.verdicts]
                 return verdicts
@@ -224,34 +228,36 @@ class ContextualPrecisionMetric(BaseMetric):
                 res = await self.model.a_generate(prompt)
                 data = trimAndLoadJson(res, self)
                 verdicts = [
-                    ContextualPrecisionVerdict(**item)
+                    cpschema.ContextualPrecisionVerdict(**item)
                     for item in data["verdicts"]
                 ]
                 return verdicts
 
     def _generate_verdicts(
         self, input: str, expected_output: str, retrieval_context: List[str]
-    ) -> List[ContextualPrecisionVerdict]:
+    ) -> List[cpschema.ContextualPrecisionVerdict]:
         prompt = self.evaluation_template.generate_verdicts(
             input=input,
             expected_output=expected_output,
             retrieval_context=retrieval_context,
         )
         if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=Verdicts)
+            res, cost = self.model.generate(prompt, schema=cpschema.Verdicts)
             self.evaluation_cost += cost
             verdicts = [item for item in res.verdicts]
             return verdicts
         else:
             try:
-                res: Verdicts = self.model.generate(prompt, schema=Verdicts)
+                res: cpschema.Verdicts = self.model.generate(
+                    prompt, schema=cpschema.Verdicts
+                )
                 verdicts = [item for item in res.verdicts]
                 return verdicts
             except TypeError:
                 res = self.model.generate(prompt)
                 data = trimAndLoadJson(res, self)
                 verdicts = [
-                    ContextualPrecisionVerdict(**item)
+                    cpschema.ContextualPrecisionVerdict(**item)
                     for item in data["verdicts"]
                 ]
                 return verdicts
@@ -288,7 +294,7 @@ class ContextualPrecisionMetric(BaseMetric):
         else:
             try:
                 self.success = self.score >= self.threshold
-            except:
+            except TypeError:
                 self.success = False
             return self.success