deepeval 3.6.6__py3-none-any.whl → 3.6.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. deepeval/_version.py +1 -1
  2. deepeval/benchmarks/equity_med_qa/equity_med_qa.py +1 -0
  3. deepeval/cli/main.py +42 -0
  4. deepeval/confident/api.py +1 -0
  5. deepeval/config/settings.py +22 -4
  6. deepeval/constants.py +8 -1
  7. deepeval/dataset/dataset.py +2 -11
  8. deepeval/dataset/utils.py +1 -1
  9. deepeval/errors.py +20 -2
  10. deepeval/evaluate/evaluate.py +5 -1
  11. deepeval/evaluate/execute.py +811 -248
  12. deepeval/evaluate/types.py +1 -0
  13. deepeval/evaluate/utils.py +33 -119
  14. deepeval/integrations/crewai/__init__.py +7 -1
  15. deepeval/integrations/crewai/handler.py +1 -1
  16. deepeval/integrations/crewai/subs.py +51 -0
  17. deepeval/integrations/crewai/tool.py +71 -0
  18. deepeval/integrations/crewai/wrapper.py +45 -5
  19. deepeval/integrations/llama_index/__init__.py +0 -4
  20. deepeval/integrations/llama_index/handler.py +20 -21
  21. deepeval/integrations/pydantic_ai/instrumentator.py +125 -76
  22. deepeval/metrics/__init__.py +13 -0
  23. deepeval/metrics/answer_relevancy/answer_relevancy.py +12 -3
  24. deepeval/metrics/api.py +281 -0
  25. deepeval/metrics/argument_correctness/argument_correctness.py +12 -2
  26. deepeval/metrics/base_metric.py +1 -0
  27. deepeval/metrics/bias/bias.py +12 -3
  28. deepeval/metrics/contextual_precision/contextual_precision.py +39 -24
  29. deepeval/metrics/contextual_recall/contextual_recall.py +12 -3
  30. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +12 -1
  31. deepeval/metrics/conversation_completeness/conversation_completeness.py +12 -0
  32. deepeval/metrics/conversational_dag/conversational_dag.py +12 -0
  33. deepeval/metrics/conversational_dag/nodes.py +12 -4
  34. deepeval/metrics/conversational_g_eval/__init__.py +3 -0
  35. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +84 -66
  36. deepeval/metrics/dag/dag.py +12 -0
  37. deepeval/metrics/dag/nodes.py +12 -4
  38. deepeval/metrics/dag/schema.py +1 -1
  39. deepeval/metrics/dag/templates.py +2 -2
  40. deepeval/metrics/faithfulness/faithfulness.py +12 -1
  41. deepeval/metrics/g_eval/g_eval.py +11 -0
  42. deepeval/metrics/goal_accuracy/__init__.py +1 -0
  43. deepeval/metrics/goal_accuracy/goal_accuracy.py +349 -0
  44. deepeval/metrics/goal_accuracy/schema.py +17 -0
  45. deepeval/metrics/goal_accuracy/template.py +235 -0
  46. deepeval/metrics/hallucination/hallucination.py +20 -9
  47. deepeval/metrics/indicator.py +8 -2
  48. deepeval/metrics/json_correctness/json_correctness.py +12 -1
  49. deepeval/metrics/knowledge_retention/knowledge_retention.py +12 -0
  50. deepeval/metrics/mcp/mcp_task_completion.py +20 -2
  51. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +29 -6
  52. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +14 -2
  53. deepeval/metrics/misuse/misuse.py +12 -1
  54. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +3 -0
  55. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +3 -0
  56. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +3 -0
  57. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +3 -0
  58. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +6 -1
  59. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +38 -25
  60. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +3 -0
  61. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +3 -0
  62. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +3 -0
  63. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +3 -0
  64. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +10 -5
  65. deepeval/metrics/non_advice/non_advice.py +12 -0
  66. deepeval/metrics/pii_leakage/pii_leakage.py +12 -1
  67. deepeval/metrics/plan_adherence/__init__.py +1 -0
  68. deepeval/metrics/plan_adherence/plan_adherence.py +292 -0
  69. deepeval/metrics/plan_adherence/schema.py +11 -0
  70. deepeval/metrics/plan_adherence/template.py +170 -0
  71. deepeval/metrics/plan_quality/__init__.py +1 -0
  72. deepeval/metrics/plan_quality/plan_quality.py +292 -0
  73. deepeval/metrics/plan_quality/schema.py +11 -0
  74. deepeval/metrics/plan_quality/template.py +101 -0
  75. deepeval/metrics/prompt_alignment/prompt_alignment.py +12 -1
  76. deepeval/metrics/role_adherence/role_adherence.py +12 -0
  77. deepeval/metrics/role_violation/role_violation.py +12 -0
  78. deepeval/metrics/step_efficiency/__init__.py +1 -0
  79. deepeval/metrics/step_efficiency/schema.py +11 -0
  80. deepeval/metrics/step_efficiency/step_efficiency.py +234 -0
  81. deepeval/metrics/step_efficiency/template.py +256 -0
  82. deepeval/metrics/summarization/summarization.py +12 -1
  83. deepeval/metrics/task_completion/task_completion.py +4 -0
  84. deepeval/metrics/tool_correctness/schema.py +6 -0
  85. deepeval/metrics/tool_correctness/template.py +88 -0
  86. deepeval/metrics/tool_correctness/tool_correctness.py +233 -21
  87. deepeval/metrics/tool_use/__init__.py +1 -0
  88. deepeval/metrics/tool_use/schema.py +19 -0
  89. deepeval/metrics/tool_use/template.py +220 -0
  90. deepeval/metrics/tool_use/tool_use.py +458 -0
  91. deepeval/metrics/topic_adherence/__init__.py +1 -0
  92. deepeval/metrics/topic_adherence/schema.py +16 -0
  93. deepeval/metrics/topic_adherence/template.py +162 -0
  94. deepeval/metrics/topic_adherence/topic_adherence.py +355 -0
  95. deepeval/metrics/toxicity/toxicity.py +12 -0
  96. deepeval/metrics/turn_relevancy/turn_relevancy.py +12 -0
  97. deepeval/models/embedding_models/azure_embedding_model.py +37 -36
  98. deepeval/models/embedding_models/local_embedding_model.py +30 -32
  99. deepeval/models/embedding_models/ollama_embedding_model.py +18 -20
  100. deepeval/models/embedding_models/openai_embedding_model.py +22 -31
  101. deepeval/models/llms/grok_model.py +1 -1
  102. deepeval/models/llms/openai_model.py +2 -0
  103. deepeval/openai/__init__.py +14 -32
  104. deepeval/openai/extractors.py +85 -50
  105. deepeval/openai/patch.py +258 -167
  106. deepeval/openai/types.py +20 -0
  107. deepeval/openai/utils.py +205 -56
  108. deepeval/prompt/__init__.py +19 -1
  109. deepeval/prompt/api.py +160 -0
  110. deepeval/prompt/prompt.py +245 -62
  111. deepeval/prompt/utils.py +186 -15
  112. deepeval/synthesizer/chunking/context_generator.py +209 -152
  113. deepeval/synthesizer/chunking/doc_chunker.py +46 -12
  114. deepeval/synthesizer/synthesizer.py +19 -15
  115. deepeval/test_case/api.py +131 -0
  116. deepeval/test_case/llm_test_case.py +6 -2
  117. deepeval/test_run/__init__.py +1 -0
  118. deepeval/test_run/hyperparameters.py +47 -8
  119. deepeval/test_run/test_run.py +292 -206
  120. deepeval/tracing/__init__.py +2 -1
  121. deepeval/tracing/api.py +3 -1
  122. deepeval/tracing/otel/exporter.py +3 -4
  123. deepeval/tracing/otel/utils.py +24 -5
  124. deepeval/tracing/trace_context.py +89 -5
  125. deepeval/tracing/tracing.py +74 -3
  126. deepeval/tracing/types.py +20 -2
  127. deepeval/tracing/utils.py +8 -0
  128. deepeval/utils.py +21 -0
  129. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/METADATA +1 -1
  130. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/RECORD +133 -103
  131. deepeval/integrations/llama_index/agent/patched.py +0 -68
  132. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/LICENSE.md +0 -0
  133. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/WHEEL +0 -0
  134. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/entry_points.txt +0 -0
deepeval/evaluate/types.py

@@ -10,6 +10,7 @@ from deepeval.test_case import MLLMImage
 class TestResult:
     """Returned from run_test"""

+    __test__ = False
     name: str
     success: bool
     metrics_data: Union[List[MetricData], None]
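The single added line is pytest's collection opt-out: a class whose name starts with Test would normally be collected as a test class, and __test__ = False tells pytest to skip it. An illustrative, stand-alone example of the same convention (not deepeval code):

    class TestResult:
        # Without this attribute pytest would try to collect the class because
        # its name matches the Test* pattern; with it, pytest leaves it alone.
        __test__ = False

        def __init__(self, name: str, success: bool):
            self.name = name
            self.success = success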
deepeval/evaluate/utils.py

@@ -5,8 +5,6 @@ import os
 import time

 from deepeval.utils import format_turn
-from deepeval.test_case.conversational_test_case import Turn
-from deepeval.test_run.api import TurnApi
 from deepeval.test_run.test_run import TestRunResultDisplay
 from deepeval.dataset import Golden
 from deepeval.metrics import (
@@ -28,7 +26,6 @@ from deepeval.evaluate.types import TestResult
 from deepeval.tracing.api import TraceApi, BaseApiSpan, TraceSpanApiStatus
 from deepeval.tracing.tracing import BaseSpan, Trace
 from deepeval.tracing.types import TraceSpanStatus
-from deepeval.constants import PYTEST_RUN_TEST_NAME
 from deepeval.tracing.utils import (
     perf_counter_to_datetime,
     to_zod_compatible_iso,
@@ -133,121 +130,6 @@ def create_test_result(
     )


-def create_api_turn(turn: Turn, index: int) -> TurnApi:
-    return TurnApi(
-        role=turn.role,
-        content=turn.content,
-        user_id=turn.user_id,
-        retrievalContext=turn.retrieval_context,
-        toolsCalled=turn.tools_called,
-        additionalMetadata=turn.additional_metadata,
-        order=index,
-    )
-
-
-def create_api_test_case(
-    test_case: Union[LLMTestCase, ConversationalTestCase, MLLMTestCase],
-    trace: Optional[TraceApi] = None,
-    index: Optional[int] = None,
-) -> Union[LLMApiTestCase, ConversationalApiTestCase]:
-    if isinstance(test_case, ConversationalTestCase):
-        order = (
-            test_case._dataset_rank
-            if test_case._dataset_rank is not None
-            else index
-        )
-        if test_case.name:
-            name = test_case.name
-        else:
-            name = os.getenv(
-                PYTEST_RUN_TEST_NAME, f"conversational_test_case_{order}"
-            )
-
-        api_test_case = ConversationalApiTestCase(
-            name=name,
-            success=True,
-            metricsData=[],
-            runDuration=0,
-            evaluationCost=None,
-            order=order,
-            scenario=test_case.scenario,
-            expectedOutcome=test_case.expected_outcome,
-            userDescription=test_case.user_description,
-            context=test_case.context,
-            tags=test_case.tags,
-            comments=test_case.comments,
-            additionalMetadata=test_case.additional_metadata,
-        )
-        api_test_case.turns = [
-            create_api_turn(
-                turn=turn,
-                index=index,
-            )
-            for index, turn in enumerate(test_case.turns)
-        ]
-
-        return api_test_case
-    else:
-        order = (
-            test_case._dataset_rank
-            if test_case._dataset_rank is not None
-            else index
-        )
-
-        success = True
-        if test_case.name is not None:
-            name = test_case.name
-        else:
-            name = os.getenv(PYTEST_RUN_TEST_NAME, f"test_case_{order}")
-        metrics_data = []
-
-        if isinstance(test_case, LLMTestCase):
-            api_test_case = LLMApiTestCase(
-                name=name,
-                input=test_case.input,
-                actualOutput=test_case.actual_output,
-                expectedOutput=test_case.expected_output,
-                context=test_case.context,
-                retrievalContext=test_case.retrieval_context,
-                toolsCalled=test_case.tools_called,
-                expectedTools=test_case.expected_tools,
-                tokenCost=test_case.token_cost,
-                completionTime=test_case.completion_time,
-                tags=test_case.tags,
-                success=success,
-                metricsData=metrics_data,
-                runDuration=None,
-                evaluationCost=None,
-                order=order,
-                additionalMetadata=test_case.additional_metadata,
-                comments=test_case.comments,
-                trace=trace,
-            )
-        elif isinstance(test_case, MLLMTestCase):
-            api_test_case = LLMApiTestCase(
-                name=name,
-                input="",
-                multimodalInput=test_case.input,
-                multimodalActualOutput=test_case.actual_output,
-                multimodalExpectedOutput=test_case.expected_output,
-                multimodalRetrievalContext=test_case.retrieval_context,
-                multimodalContext=test_case.context,
-                toolsCalled=test_case.tools_called,
-                expectedTools=test_case.expected_tools,
-                tokenCost=test_case.token_cost,
-                completionTime=test_case.completion_time,
-                success=success,
-                metricsData=metrics_data,
-                runDuration=None,
-                evaluationCost=None,
-                order=order,
-                additionalMetadata=test_case.additional_metadata,
-                comments=test_case.comments,
-            )
-        # llm_test_case_lookup_map[instance_id] = api_test_case
-        return api_test_case
-
-
 def create_api_trace(trace: Trace, golden: Golden) -> TraceApi:
     return TraceApi(
         uuid=trace.uuid,
@@ -309,6 +191,26 @@ def validate_assert_test_inputs(
             "Both 'test_case' and 'metrics' must be provided together."
         )

+    if test_case and metrics:
+        if isinstance(test_case, LLMTestCase) and not all(
+            isinstance(metric, BaseMetric) for metric in metrics
+        ):
+            raise ValueError(
+                "All 'metrics' for an 'LLMTestCase' must be instances of 'BaseMetric' only."
+            )
+        if isinstance(test_case, ConversationalTestCase) and not all(
+            isinstance(metric, BaseConversationalMetric) for metric in metrics
+        ):
+            raise ValueError(
+                "All 'metrics' for an 'ConversationalTestCase' must be instances of 'BaseConversationalMetric' only."
+            )
+        if isinstance(test_case, MLLMTestCase) and not all(
+            isinstance(metric, BaseMultimodalMetric) for metric in metrics
+        ):
+            raise ValueError(
+                "All 'metrics' for an 'MLLMTestCase' must be instances of 'BaseMultimodalMetric' only."
+            )
+
 if not ((golden and observed_callback) or (test_case and metrics)):
     raise ValueError(
         "You must provide either ('golden' + 'observed_callback') or ('test_case' + 'metrics')."
@@ -577,6 +479,18 @@ def count_metrics_in_trace(trace: Trace) -> int:
     return sum(count_metrics_recursive(span) for span in trace.root_spans)


+def count_total_metrics_for_trace(trace: Trace) -> int:
+    """Span subtree metrics + trace-level metrics."""
+    return count_metrics_in_trace(trace=trace) + len(trace.metrics or [])
+
+
+def count_metrics_in_span_subtree(span: BaseSpan) -> int:
+    total = len(span.metrics or [])
+    for c in span.children or []:
+        total += count_metrics_in_span_subtree(c)
+    return total
+
+
 def extract_trace_test_results(trace_api: TraceApi) -> List[TestResult]:
     test_results: List[TestResult] = []
     # extract trace result
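count_metrics_in_span_subtree is a plain recursive sum over a span and all of its descendants. A self-contained illustration of the recursion using a stand-in class (not deepeval's BaseSpan):

    from dataclasses import dataclass, field
    from typing import List, Optional

    @dataclass
    class FakeSpan:  # stand-in: only the two attributes the counter reads
        metrics: Optional[list] = None
        children: List["FakeSpan"] = field(default_factory=list)

    def count_metrics_in_span_subtree(span: FakeSpan) -> int:
        total = len(span.metrics or [])
        for child in span.children or []:
            total += count_metrics_in_span_subtree(child)
        return total

    root = FakeSpan(metrics=["m1"], children=[FakeSpan(metrics=["m2", "m3"])])
    assert count_metrics_in_span_subtree(root) == 3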
@@ -619,7 +533,7 @@ def extract_span_test_results(span_api: BaseApiSpan) -> List[TestResult]:
         test_results.append(
             TestResult(
                 name=span_api.name,
-                success=span_api.status == "SUCCESS",
+                success=span_api.status == TraceSpanApiStatus.SUCCESS,
                 metrics_data=span_api.metrics_data,
                 input=span_api.input,
                 actual_output=span_api.output,
deepeval/integrations/crewai/__init__.py

@@ -1,3 +1,9 @@
 from .handler import instrument_crewai
+from .subs import (
+    DeepEvalCrew as Crew,
+    DeepEvalAgent as Agent,
+    DeepEvalLLM as LLM,
+)
+from .tool import tool

-__all__ = ["instrument_crewai"]
+__all__ = ["instrument_crewai", "Crew", "Agent", "LLM", "tool"]
deepeval/integrations/crewai/handler.py

@@ -13,7 +13,7 @@ logger = logging.getLogger(__name__)


 try:
-    from crewai.utilities.events.base_event_listener import BaseEventListener
+    from crewai.events import BaseEventListener
     from crewai.events import (
         CrewKickoffStartedEvent,
         CrewKickoffCompletedEvent,
deepeval/integrations/crewai/subs.py (new file)

@@ -0,0 +1,51 @@
+from typing import List, Optional, Type, TypeVar
+from pydantic import PrivateAttr
+
+from deepeval.metrics.base_metric import BaseMetric
+
+try:
+    from crewai import Crew, Agent, LLM
+
+    is_crewai_installed = True
+except ImportError:
+    is_crewai_installed = False
+
+
+def is_crewai_installed():
+    if not is_crewai_installed:
+        raise ImportError(
+            "CrewAI is not installed. Please install it with `pip install crewai`."
+        )
+
+
+T = TypeVar("T")
+
+
+def create_deepeval_class(base_class: Type[T], class_name: str) -> Type[T]:
+    """Factory function to create DeepEval-enabled CrewAI classes"""
+
+    class DeepEvalClass(base_class):
+        _metric_collection: Optional[str] = PrivateAttr(default=None)
+        _metrics: Optional[List[BaseMetric]] = PrivateAttr(default=None)
+
+        def __init__(
+            self,
+            *args,
+            metrics: Optional[List[BaseMetric]] = None,
+            metric_collection: Optional[str] = None,
+            **kwargs
+        ):
+            is_crewai_installed()
+            super().__init__(*args, **kwargs)
+            self._metric_collection = metric_collection
+            self._metrics = metrics
+
+    DeepEvalClass.__name__ = class_name
+    DeepEvalClass.__qualname__ = class_name
+    return DeepEvalClass
+
+
+# Create the classes
+DeepEvalCrew = create_deepeval_class(Crew, "DeepEvalCrew")
+DeepEvalAgent = create_deepeval_class(Agent, "DeepEvalAgent")
+DeepEvalLLM = create_deepeval_class(LLM, "DeepEvalLLM")
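The factory above subclasses the CrewAI types, accepts two extra keywords, and parks them in pydantic PrivateAttr slots; everything else is forwarded unchanged. An illustrative round-trip, assuming CrewAI is installed (argument values are made up):

    from deepeval.integrations.crewai.subs import DeepEvalAgent

    agent = DeepEvalAgent(
        role="Researcher",
        goal="Answer user questions",
        backstory="A focused research assistant.",
        metric_collection="crewai-agents",
    )

    # The extra values live on private attributes, not on CrewAI's own fields;
    # wrapper.py (further down) reads them back via getattr.
    assert agent._metric_collection == "crewai-agents"
    assert agent._metrics is None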
deepeval/integrations/crewai/tool.py (new file)

@@ -0,0 +1,71 @@
+import functools
+from typing import Callable
+from crewai.tools import tool as crewai_tool
+
+from deepeval.tracing.context import current_span_context
+from deepeval.tracing.types import ToolSpan
+
+
+def tool(*args, metric=None, metric_collection=None, **kwargs) -> Callable:
+    """
+    Simple wrapper around crewai.tools.tool that:
+    - prints the original function's input and output
+    - accepts additional parameters: metric and metric_collection (unused, for compatibility)
+    - remains backward compatible with CrewAI's decorator usage patterns
+    """
+    crewai_kwargs = kwargs
+
+    # Case 1: @tool (function passed directly)
+    if len(args) == 1 and callable(args[0]):
+        f = args[0]
+        tool_name = f.__name__
+
+        @functools.wraps(f)
+        def wrapped(*f_args, **f_kwargs):
+            current_span = current_span_context.get()
+            if current_span and isinstance(current_span, ToolSpan):
+                current_span.metric_collection = metric_collection
+                current_span.metrics = metric
+            result = f(*f_args, **f_kwargs)
+            return result
+
+        return crewai_tool(tool_name, **crewai_kwargs)(wrapped)
+
+    # Case 2: @tool("name")
+    if len(args) == 1 and isinstance(args[0], str):
+        tool_name = args[0]
+
+        def _decorator(f: Callable) -> Callable:
+            @functools.wraps(f)
+            def wrapped(*f_args, **f_kwargs):
+                current_span = current_span_context.get()
+                if current_span and isinstance(current_span, ToolSpan):
+                    current_span.metric_collection = metric_collection
+                    current_span.metrics = metric
+                result = f(*f_args, **f_kwargs)
+                return result
+
+            return crewai_tool(tool_name, **crewai_kwargs)(wrapped)
+
+        return _decorator
+
+    # Case 3: @tool(result_as_answer=True, ...) — kwargs only
+    if len(args) == 0:
+
+        def _decorator(f: Callable) -> Callable:
+            tool_name = f.__name__
+
+            @functools.wraps(f)
+            def wrapped(*f_args, **f_kwargs):
+                current_span = current_span_context.get()
+                if current_span and isinstance(current_span, ToolSpan):
+                    current_span.metric_collection = metric_collection
+                    current_span.metrics = metric
+                result = f(*f_args, **f_kwargs)
+                return result
+
+            return crewai_tool(tool_name, **crewai_kwargs)(wrapped)
+
+        return _decorator
+
+    raise ValueError("Invalid arguments")
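The decorator preserves CrewAI's three calling conventions and only adds the span bookkeeping before delegating to crewai.tools.tool. A hedged usage sketch, assuming CrewAI is installed (the example functions and collection name are illustrative):

    from deepeval.integrations.crewai import tool

    @tool  # Case 1: bare decorator, tool name taken from the function name
    def get_weather(city: str) -> str:
        """Return a canned weather report for a city."""
        return f"It is sunny in {city}."

    @tool("unit_converter", metric_collection="crewai-tools")  # Case 2: explicit name
    def convert(value: float, unit: str) -> str:
        """Pretend to convert a value to the requested unit."""
        return f"{value} {unit}"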
deepeval/integrations/crewai/wrapper.py

@@ -3,6 +3,7 @@ from crewai.crew import Crew
 from crewai.agent import Agent
 from functools import wraps
 from deepeval.tracing.tracing import Observer
+from typing import Any


 def wrap_crew_kickoff():
@@ -10,7 +11,13 @@ def wrap_crew_kickoff():

     @wraps(original_kickoff)
     def wrapper(self, *args, **kwargs):
-        with Observer(span_type="crew", func_name="kickoff"):
+        metric_collection, metrics = _check_metrics_and_metric_collection(self)
+        with Observer(
+            span_type="crew",
+            func_name="kickoff",
+            metric_collection=metric_collection,
+            metrics=metrics,
+        ):
             result = original_kickoff(self, *args, **kwargs)

             return result
@@ -23,7 +30,13 @@ def wrap_crew_kickoff_for_each():

     @wraps(original_kickoff_for_each)
     def wrapper(self, *args, **kwargs):
-        with Observer(span_type="crew", func_name="kickoff_for_each"):
+        metric_collection, metrics = _check_metrics_and_metric_collection(self)
+        with Observer(
+            span_type="crew",
+            func_name="kickoff_for_each",
+            metric_collection=metric_collection,
+            metrics=metrics,
+        ):
             result = original_kickoff_for_each(self, *args, **kwargs)

             return result
@@ -36,7 +49,13 @@ def wrap_crew_kickoff_async():

     @wraps(original_kickoff_async)
     async def wrapper(self, *args, **kwargs):
-        with Observer(span_type="crew", func_name="kickoff_async"):
+        metric_collection, metrics = _check_metrics_and_metric_collection(self)
+        with Observer(
+            span_type="crew",
+            func_name="kickoff_async",
+            metric_collection=metric_collection,
+            metrics=metrics,
+        ):
             result = await original_kickoff_async(self, *args, **kwargs)

             return result
@@ -49,7 +68,13 @@ def wrap_crew_kickoff_for_each_async():

     @wraps(original_kickoff_for_each_async)
     async def wrapper(self, *args, **kwargs):
-        with Observer(span_type="crew", func_name="kickoff_for_each_async"):
+        metric_collection, metrics = _check_metrics_and_metric_collection(self)
+        with Observer(
+            span_type="crew",
+            func_name="kickoff_for_each_async",
+            metric_collection=metric_collection,
+            metrics=metrics,
+        ):
             result = await original_kickoff_for_each_async(
                 self, *args, **kwargs
             )
@@ -64,10 +89,13 @@ def wrap_llm_call():

     @wraps(original_llm_call)
     def wrapper(self, *args, **kwargs):
+        metric_collection, metrics = _check_metrics_and_metric_collection(self)
         with Observer(
             span_type="llm",
             func_name="call",
             observe_kwargs={"model": "temp_model"},
+            metric_collection=metric_collection,
+            metrics=metrics,
         ):
             result = original_llm_call(self, *args, **kwargs)
             return result
@@ -80,8 +108,20 @@ def wrap_agent_execute_task():

     @wraps(original_execute_task)
     def wrapper(self, *args, **kwargs):
-        with Observer(span_type="agent", func_name="execute_task"):
+        metric_collection, metrics = _check_metrics_and_metric_collection(self)
+        with Observer(
+            span_type="agent",
+            func_name="execute_task",
+            metric_collection=metric_collection,
+            metrics=metrics,
+        ):
             result = original_execute_task(self, *args, **kwargs)
             return result

     Agent.execute_task = wrapper
+
+
+def _check_metrics_and_metric_collection(obj: Any):
+    metric_collection = getattr(obj, "_metric_collection", None)
+    metrics = getattr(obj, "_metrics", None)
+    return metric_collection, metrics
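All wrappers now funnel through _check_metrics_and_metric_collection, which is duck-typed: any object carrying _metric_collection / _metrics attributes (the DeepEvalCrew / DeepEvalAgent / DeepEvalLLM subclasses from subs.py, for instance) has them forwarded into the Observer span, while plain CrewAI objects simply yield (None, None). A self-contained illustration of that contract:

    from typing import Any

    def _check_metrics_and_metric_collection(obj: Any):
        # Same helper as above: read the DeepEval private attributes if present.
        metric_collection = getattr(obj, "_metric_collection", None)
        metrics = getattr(obj, "_metrics", None)
        return metric_collection, metrics

    class PlainCrewObject:  # no DeepEval attributes
        pass

    class DeepEvalLike:  # mimics the subclasses produced in subs.py
        _metric_collection = "crewai-agents"
        _metrics = None

    assert _check_metrics_and_metric_collection(PlainCrewObject()) == (None, None)
    assert _check_metrics_and_metric_collection(DeepEvalLike()) == ("crewai-agents", None)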
deepeval/integrations/llama_index/__init__.py

@@ -1,10 +1,6 @@
 from .handler import instrument_llama_index
-from .agent.patched import FunctionAgent, ReActAgent, CodeActAgent


 __all__ = [
     "instrument_llama_index",
-    "FunctionAgent",
-    "ReActAgent",
-    "CodeActAgent",
 ]
deepeval/integrations/llama_index/handler.py

@@ -5,6 +5,10 @@ import uuid
 from deepeval.telemetry import capture_tracing_integration
 from deepeval.tracing import trace_manager
 from deepeval.tracing.types import AgentSpan, BaseSpan, LlmSpan, TraceSpanStatus
+from deepeval.tracing.trace_context import (
+    current_llm_context,
+    current_agent_context,
+)

 try:
     from llama_index.core.instrumentation.events.base import BaseEvent
@@ -22,11 +26,6 @@ try:
         LLMChatEndEvent,
     )
     from llama_index_instrumentation.dispatcher import Dispatcher
-    from deepeval.integrations.llama_index.agent.patched import (
-        FunctionAgent as PatchedFunctionAgent,
-        ReActAgent as PatchedReActAgent,
-        CodeActAgent as PatchedCodeActAgent,
-    )
     from deepeval.integrations.llama_index.utils import (
         parse_id,
         prepare_input_llm_test_case_params,
@@ -67,6 +66,7 @@ class LLamaIndexHandler(BaseEventHandler, BaseSpanHandler):
             ).strip()
             input_messages.append({"role": role, "content": content})

+        llm_span_context = current_llm_context.get()
         # create the span
         llm_span = LlmSpan(
             name="ConfidentLLMSpan",
@@ -83,6 +83,12 @@ class LLamaIndexHandler(BaseEventHandler, BaseSpanHandler):
             ), # check the model name not coming in this option
             input=input_messages,
             output="",
+            metrics=llm_span_context.metrics if llm_span_context else None,
+            metric_collection=(
+                llm_span_context.metric_collection
+                if llm_span_context
+                else None
+            ),
         )
         trace_manager.add_span(llm_span)
         trace_manager.add_span_to_trace(llm_span)
@@ -144,6 +150,7 @@ class LLamaIndexHandler(BaseEventHandler, BaseSpanHandler):

         # conditions to qualify as agent start run span
         if method_name == "run":
+            agent_span_context = current_agent_context.get()
             span = AgentSpan(
                 uuid=id_,
                 status=TraceSpanStatus.IN_PROGRESS,
@@ -153,24 +160,16 @@ class LLamaIndexHandler(BaseEventHandler, BaseSpanHandler):
                 start_time=perf_counter(),
                 name="Agent", # TODO: decide the name of the span
                 input=bound_args.arguments,
+                metrics=(
+                    agent_span_context.metrics if agent_span_context else None
+                ),
+                metric_collection=(
+                    agent_span_context.metric_collection
+                    if agent_span_context
+                    else None
+                ),
             )

-            # check if the instance is a PatchedFunctionAgent
-            if isinstance(instance, PatchedFunctionAgent):
-                span.name = "FunctionAgent"
-                span.metric_collection = instance.metric_collection
-                span.metrics = instance.metrics
-
-            if isinstance(instance, PatchedReActAgent):
-                span.name = "ReActAgent"
-                span.metric_collection = instance.metric_collection
-                span.metrics = instance.metrics
-
-            if isinstance(instance, PatchedCodeActAgent):
-                span.name = "CodeActAgent"
-                span.metric_collection = instance.metric_collection
-                span.metrics = instance.metrics
-
             # prepare input test case params for the span
             prepare_input_llm_test_case_params(
                 class_name, method_name, span, bound_args.arguments
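With the patched agent classes gone, metrics reach the LlamaIndex handler through current_llm_context and current_agent_context from deepeval.tracing.trace_context (that module also changed in this release, but is not shown in this excerpt). The handler only relies on .get() returning either None or an object exposing metrics and metric_collection; a stand-in illustration of that contract, not the real trace_context implementation:

    from contextvars import ContextVar
    from dataclasses import dataclass
    from typing import List, Optional

    @dataclass
    class SpanContext:  # illustrative: only the two attributes the handler reads
        metrics: Optional[List] = None
        metric_collection: Optional[str] = None

    current_agent_context: ContextVar[Optional[SpanContext]] = ContextVar(
        "current_agent_context", default=None
    )

    # Whatever sets the context upstream...
    current_agent_context.set(SpanContext(metric_collection="llama-agents"))

    # ...the read side stays the same shape as in the diff above:
    agent_span_context = current_agent_context.get()
    metric_collection = (
        agent_span_context.metric_collection if agent_span_context else None
    )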