deepeval 3.6.6__py3-none-any.whl → 3.6.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/benchmarks/equity_med_qa/equity_med_qa.py +1 -0
- deepeval/cli/main.py +42 -0
- deepeval/confident/api.py +1 -0
- deepeval/config/settings.py +22 -4
- deepeval/constants.py +8 -1
- deepeval/dataset/dataset.py +2 -11
- deepeval/dataset/utils.py +1 -1
- deepeval/errors.py +20 -2
- deepeval/evaluate/evaluate.py +5 -1
- deepeval/evaluate/execute.py +811 -248
- deepeval/evaluate/types.py +1 -0
- deepeval/evaluate/utils.py +33 -119
- deepeval/integrations/crewai/__init__.py +7 -1
- deepeval/integrations/crewai/handler.py +1 -1
- deepeval/integrations/crewai/subs.py +51 -0
- deepeval/integrations/crewai/tool.py +71 -0
- deepeval/integrations/crewai/wrapper.py +45 -5
- deepeval/integrations/llama_index/__init__.py +0 -4
- deepeval/integrations/llama_index/handler.py +20 -21
- deepeval/integrations/pydantic_ai/instrumentator.py +125 -76
- deepeval/metrics/__init__.py +13 -0
- deepeval/metrics/answer_relevancy/answer_relevancy.py +12 -3
- deepeval/metrics/api.py +281 -0
- deepeval/metrics/argument_correctness/argument_correctness.py +12 -2
- deepeval/metrics/base_metric.py +1 -0
- deepeval/metrics/bias/bias.py +12 -3
- deepeval/metrics/contextual_precision/contextual_precision.py +39 -24
- deepeval/metrics/contextual_recall/contextual_recall.py +12 -3
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +12 -1
- deepeval/metrics/conversation_completeness/conversation_completeness.py +12 -0
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -0
- deepeval/metrics/conversational_dag/nodes.py +12 -4
- deepeval/metrics/conversational_g_eval/__init__.py +3 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +84 -66
- deepeval/metrics/dag/dag.py +12 -0
- deepeval/metrics/dag/nodes.py +12 -4
- deepeval/metrics/dag/schema.py +1 -1
- deepeval/metrics/dag/templates.py +2 -2
- deepeval/metrics/faithfulness/faithfulness.py +12 -1
- deepeval/metrics/g_eval/g_eval.py +11 -0
- deepeval/metrics/goal_accuracy/__init__.py +1 -0
- deepeval/metrics/goal_accuracy/goal_accuracy.py +349 -0
- deepeval/metrics/goal_accuracy/schema.py +17 -0
- deepeval/metrics/goal_accuracy/template.py +235 -0
- deepeval/metrics/hallucination/hallucination.py +20 -9
- deepeval/metrics/indicator.py +8 -2
- deepeval/metrics/json_correctness/json_correctness.py +12 -1
- deepeval/metrics/knowledge_retention/knowledge_retention.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +20 -2
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +29 -6
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +14 -2
- deepeval/metrics/misuse/misuse.py +12 -1
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +3 -0
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +3 -0
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +3 -0
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +6 -1
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +38 -25
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +10 -5
- deepeval/metrics/non_advice/non_advice.py +12 -0
- deepeval/metrics/pii_leakage/pii_leakage.py +12 -1
- deepeval/metrics/plan_adherence/__init__.py +1 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +292 -0
- deepeval/metrics/plan_adherence/schema.py +11 -0
- deepeval/metrics/plan_adherence/template.py +170 -0
- deepeval/metrics/plan_quality/__init__.py +1 -0
- deepeval/metrics/plan_quality/plan_quality.py +292 -0
- deepeval/metrics/plan_quality/schema.py +11 -0
- deepeval/metrics/plan_quality/template.py +101 -0
- deepeval/metrics/prompt_alignment/prompt_alignment.py +12 -1
- deepeval/metrics/role_adherence/role_adherence.py +12 -0
- deepeval/metrics/role_violation/role_violation.py +12 -0
- deepeval/metrics/step_efficiency/__init__.py +1 -0
- deepeval/metrics/step_efficiency/schema.py +11 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +234 -0
- deepeval/metrics/step_efficiency/template.py +256 -0
- deepeval/metrics/summarization/summarization.py +12 -1
- deepeval/metrics/task_completion/task_completion.py +4 -0
- deepeval/metrics/tool_correctness/schema.py +6 -0
- deepeval/metrics/tool_correctness/template.py +88 -0
- deepeval/metrics/tool_correctness/tool_correctness.py +233 -21
- deepeval/metrics/tool_use/__init__.py +1 -0
- deepeval/metrics/tool_use/schema.py +19 -0
- deepeval/metrics/tool_use/template.py +220 -0
- deepeval/metrics/tool_use/tool_use.py +458 -0
- deepeval/metrics/topic_adherence/__init__.py +1 -0
- deepeval/metrics/topic_adherence/schema.py +16 -0
- deepeval/metrics/topic_adherence/template.py +162 -0
- deepeval/metrics/topic_adherence/topic_adherence.py +355 -0
- deepeval/metrics/toxicity/toxicity.py +12 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +12 -0
- deepeval/models/embedding_models/azure_embedding_model.py +37 -36
- deepeval/models/embedding_models/local_embedding_model.py +30 -32
- deepeval/models/embedding_models/ollama_embedding_model.py +18 -20
- deepeval/models/embedding_models/openai_embedding_model.py +22 -31
- deepeval/models/llms/grok_model.py +1 -1
- deepeval/models/llms/openai_model.py +2 -0
- deepeval/openai/__init__.py +14 -32
- deepeval/openai/extractors.py +85 -50
- deepeval/openai/patch.py +258 -167
- deepeval/openai/types.py +20 -0
- deepeval/openai/utils.py +205 -56
- deepeval/prompt/__init__.py +19 -1
- deepeval/prompt/api.py +160 -0
- deepeval/prompt/prompt.py +245 -62
- deepeval/prompt/utils.py +186 -15
- deepeval/synthesizer/chunking/context_generator.py +209 -152
- deepeval/synthesizer/chunking/doc_chunker.py +46 -12
- deepeval/synthesizer/synthesizer.py +19 -15
- deepeval/test_case/api.py +131 -0
- deepeval/test_case/llm_test_case.py +6 -2
- deepeval/test_run/__init__.py +1 -0
- deepeval/test_run/hyperparameters.py +47 -8
- deepeval/test_run/test_run.py +292 -206
- deepeval/tracing/__init__.py +2 -1
- deepeval/tracing/api.py +3 -1
- deepeval/tracing/otel/exporter.py +3 -4
- deepeval/tracing/otel/utils.py +24 -5
- deepeval/tracing/trace_context.py +89 -5
- deepeval/tracing/tracing.py +74 -3
- deepeval/tracing/types.py +20 -2
- deepeval/tracing/utils.py +8 -0
- deepeval/utils.py +21 -0
- {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/METADATA +1 -1
- {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/RECORD +133 -103
- deepeval/integrations/llama_index/agent/patched.py +0 -68
- {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/LICENSE.md +0 -0
- {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/WHEEL +0 -0
- {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/entry_points.txt +0 -0
deepeval/evaluate/types.py
CHANGED
deepeval/evaluate/utils.py
CHANGED
@@ -5,8 +5,6 @@ import os
 import time

 from deepeval.utils import format_turn
-from deepeval.test_case.conversational_test_case import Turn
-from deepeval.test_run.api import TurnApi
 from deepeval.test_run.test_run import TestRunResultDisplay
 from deepeval.dataset import Golden
 from deepeval.metrics import (
@@ -28,7 +26,6 @@ from deepeval.evaluate.types import TestResult
 from deepeval.tracing.api import TraceApi, BaseApiSpan, TraceSpanApiStatus
 from deepeval.tracing.tracing import BaseSpan, Trace
 from deepeval.tracing.types import TraceSpanStatus
-from deepeval.constants import PYTEST_RUN_TEST_NAME
 from deepeval.tracing.utils import (
     perf_counter_to_datetime,
     to_zod_compatible_iso,
@@ -133,121 +130,6 @@ def create_test_result(
     )


-def create_api_turn(turn: Turn, index: int) -> TurnApi:
-    return TurnApi(
-        role=turn.role,
-        content=turn.content,
-        user_id=turn.user_id,
-        retrievalContext=turn.retrieval_context,
-        toolsCalled=turn.tools_called,
-        additionalMetadata=turn.additional_metadata,
-        order=index,
-    )
-
-
-def create_api_test_case(
-    test_case: Union[LLMTestCase, ConversationalTestCase, MLLMTestCase],
-    trace: Optional[TraceApi] = None,
-    index: Optional[int] = None,
-) -> Union[LLMApiTestCase, ConversationalApiTestCase]:
-    if isinstance(test_case, ConversationalTestCase):
-        order = (
-            test_case._dataset_rank
-            if test_case._dataset_rank is not None
-            else index
-        )
-        if test_case.name:
-            name = test_case.name
-        else:
-            name = os.getenv(
-                PYTEST_RUN_TEST_NAME, f"conversational_test_case_{order}"
-            )
-
-        api_test_case = ConversationalApiTestCase(
-            name=name,
-            success=True,
-            metricsData=[],
-            runDuration=0,
-            evaluationCost=None,
-            order=order,
-            scenario=test_case.scenario,
-            expectedOutcome=test_case.expected_outcome,
-            userDescription=test_case.user_description,
-            context=test_case.context,
-            tags=test_case.tags,
-            comments=test_case.comments,
-            additionalMetadata=test_case.additional_metadata,
-        )
-        api_test_case.turns = [
-            create_api_turn(
-                turn=turn,
-                index=index,
-            )
-            for index, turn in enumerate(test_case.turns)
-        ]
-
-        return api_test_case
-    else:
-        order = (
-            test_case._dataset_rank
-            if test_case._dataset_rank is not None
-            else index
-        )
-
-        success = True
-        if test_case.name is not None:
-            name = test_case.name
-        else:
-            name = os.getenv(PYTEST_RUN_TEST_NAME, f"test_case_{order}")
-        metrics_data = []
-
-        if isinstance(test_case, LLMTestCase):
-            api_test_case = LLMApiTestCase(
-                name=name,
-                input=test_case.input,
-                actualOutput=test_case.actual_output,
-                expectedOutput=test_case.expected_output,
-                context=test_case.context,
-                retrievalContext=test_case.retrieval_context,
-                toolsCalled=test_case.tools_called,
-                expectedTools=test_case.expected_tools,
-                tokenCost=test_case.token_cost,
-                completionTime=test_case.completion_time,
-                tags=test_case.tags,
-                success=success,
-                metricsData=metrics_data,
-                runDuration=None,
-                evaluationCost=None,
-                order=order,
-                additionalMetadata=test_case.additional_metadata,
-                comments=test_case.comments,
-                trace=trace,
-            )
-        elif isinstance(test_case, MLLMTestCase):
-            api_test_case = LLMApiTestCase(
-                name=name,
-                input="",
-                multimodalInput=test_case.input,
-                multimodalActualOutput=test_case.actual_output,
-                multimodalExpectedOutput=test_case.expected_output,
-                multimodalRetrievalContext=test_case.retrieval_context,
-                multimodalContext=test_case.context,
-                toolsCalled=test_case.tools_called,
-                expectedTools=test_case.expected_tools,
-                tokenCost=test_case.token_cost,
-                completionTime=test_case.completion_time,
-                success=success,
-                metricsData=metrics_data,
-                runDuration=None,
-                evaluationCost=None,
-                order=order,
-                additionalMetadata=test_case.additional_metadata,
-                comments=test_case.comments,
-            )
-        # llm_test_case_lookup_map[instance_id] = api_test_case
-        return api_test_case
-
-
 def create_api_trace(trace: Trace, golden: Golden) -> TraceApi:
     return TraceApi(
         uuid=trace.uuid,
@@ -309,6 +191,26 @@ def validate_assert_test_inputs(
             "Both 'test_case' and 'metrics' must be provided together."
         )

+    if test_case and metrics:
+        if isinstance(test_case, LLMTestCase) and not all(
+            isinstance(metric, BaseMetric) for metric in metrics
+        ):
+            raise ValueError(
+                "All 'metrics' for an 'LLMTestCase' must be instances of 'BaseMetric' only."
+            )
+        if isinstance(test_case, ConversationalTestCase) and not all(
+            isinstance(metric, BaseConversationalMetric) for metric in metrics
+        ):
+            raise ValueError(
+                "All 'metrics' for an 'ConversationalTestCase' must be instances of 'BaseConversationalMetric' only."
+            )
+        if isinstance(test_case, MLLMTestCase) and not all(
+            isinstance(metric, BaseMultimodalMetric) for metric in metrics
+        ):
+            raise ValueError(
+                "All 'metrics' for an 'MLLMTestCase' must be instances of 'BaseMultimodalMetric' only."
+            )
+
     if not ((golden and observed_callback) or (test_case and metrics)):
         raise ValueError(
             "You must provide either ('golden' + 'observed_callback') or ('test_case' + 'metrics')."
@@ -577,6 +479,18 @@ def count_metrics_in_trace(trace: Trace) -> int:
     return sum(count_metrics_recursive(span) for span in trace.root_spans)


+def count_total_metrics_for_trace(trace: Trace) -> int:
+    """Span subtree metrics + trace-level metrics."""
+    return count_metrics_in_trace(trace=trace) + len(trace.metrics or [])
+
+
+def count_metrics_in_span_subtree(span: BaseSpan) -> int:
+    total = len(span.metrics or [])
+    for c in span.children or []:
+        total += count_metrics_in_span_subtree(c)
+    return total
+
+
 def extract_trace_test_results(trace_api: TraceApi) -> List[TestResult]:
     test_results: List[TestResult] = []
     # extract trace result
@@ -619,7 +533,7 @@ def extract_span_test_results(span_api: BaseApiSpan) -> List[TestResult]:
     test_results.append(
         TestResult(
             name=span_api.name,
-            success=span_api.status ==
+            success=span_api.status == TraceSpanApiStatus.SUCCESS,
             metrics_data=span_api.metrics_data,
             input=span_api.input,
             actual_output=span_api.output,
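The new type guard in validate_assert_test_inputs rejects metric/test-case mismatches before evaluation starts. Below is a minimal sketch of what that looks like from the caller's side; it assumes the function's remaining parameters default to None and that RoleAdherenceMetric is a BaseConversationalMetric, neither of which is shown in this diff.

# Illustrative only: exercises the new isinstance checks added above.
from deepeval.evaluate.utils import validate_assert_test_inputs
from deepeval.metrics import AnswerRelevancyMetric, RoleAdherenceMetric
from deepeval.test_case import LLMTestCase

test_case = LLMTestCase(
    input="What is DeepEval?", actual_output="An LLM evaluation framework."
)

# Passes: every metric is a BaseMetric, matching the single-turn LLMTestCase.
validate_assert_test_inputs(test_case=test_case, metrics=[AnswerRelevancyMetric()])

# Raises: a conversational metric paired with an LLMTestCase.
try:
    validate_assert_test_inputs(test_case=test_case, metrics=[RoleAdherenceMetric()])
except ValueError as err:
    print(err)  # "All 'metrics' for an 'LLMTestCase' must be instances of 'BaseMetric' only."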
deepeval/integrations/crewai/__init__.py
CHANGED
@@ -1,3 +1,9 @@
 from .handler import instrument_crewai
+from .subs import (
+    DeepEvalCrew as Crew,
+    DeepEvalAgent as Agent,
+    DeepEvalLLM as LLM,
+)
+from .tool import tool

-__all__ = ["instrument_crewai"]
+__all__ = ["instrument_crewai", "Crew", "Agent", "LLM", "tool"]
deepeval/integrations/crewai/handler.py
CHANGED
@@ -13,7 +13,7 @@ logger = logging.getLogger(__name__)


 try:
-    from crewai.
+    from crewai.events import BaseEventListener
     from crewai.events import (
         CrewKickoffStartedEvent,
         CrewKickoffCompletedEvent,
deepeval/integrations/crewai/subs.py
ADDED
@@ -0,0 +1,51 @@
+from typing import List, Optional, Type, TypeVar
+from pydantic import PrivateAttr
+
+from deepeval.metrics.base_metric import BaseMetric
+
+try:
+    from crewai import Crew, Agent, LLM
+
+    is_crewai_installed = True
+except ImportError:
+    is_crewai_installed = False
+
+
+def is_crewai_installed():
+    if not is_crewai_installed:
+        raise ImportError(
+            "CrewAI is not installed. Please install it with `pip install crewai`."
+        )
+
+
+T = TypeVar("T")
+
+
+def create_deepeval_class(base_class: Type[T], class_name: str) -> Type[T]:
+    """Factory function to create DeepEval-enabled CrewAI classes"""
+
+    class DeepEvalClass(base_class):
+        _metric_collection: Optional[str] = PrivateAttr(default=None)
+        _metrics: Optional[List[BaseMetric]] = PrivateAttr(default=None)
+
+        def __init__(
+            self,
+            *args,
+            metrics: Optional[List[BaseMetric]] = None,
+            metric_collection: Optional[str] = None,
+            **kwargs
+        ):
+            is_crewai_installed()
+            super().__init__(*args, **kwargs)
+            self._metric_collection = metric_collection
+            self._metrics = metrics
+
+    DeepEvalClass.__name__ = class_name
+    DeepEvalClass.__qualname__ = class_name
+    return DeepEvalClass
+
+
+# Create the classes
+DeepEvalCrew = create_deepeval_class(Crew, "DeepEvalCrew")
+DeepEvalAgent = create_deepeval_class(Agent, "DeepEvalAgent")
+DeepEvalLLM = create_deepeval_class(LLM, "DeepEvalLLM")
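These subclasses let metrics and a metric collection ride along on ordinary CrewAI objects. A minimal usage sketch follows; the role/goal/task text and the AnswerRelevancyMetric choice are illustrative, and instrument_crewai's exact arguments are not shown in this diff.

from crewai import Task
from deepeval.integrations.crewai import Agent, Crew, instrument_crewai
from deepeval.metrics import AnswerRelevancyMetric

instrument_crewai()  # assumption: callable with defaults; check the deepeval docs for arguments

agent = Agent(
    role="Researcher",
    goal="Answer questions concisely",
    backstory="An expert researcher",
    metrics=[AnswerRelevancyMetric()],  # stored on the private _metrics attribute
)
task = Task(
    description="Explain what DeepEval does",
    expected_output="A short answer",
    agent=agent,
)
crew = Crew(agents=[agent], tasks=[task], metric_collection="crew-metrics")
# crew.kickoff() runs as usual; the wrapper.py changes later in this diff read
# _metrics / _metric_collection off these objects and attach them to the spans.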
deepeval/integrations/crewai/tool.py
ADDED
@@ -0,0 +1,71 @@
+import functools
+from typing import Callable
+from crewai.tools import tool as crewai_tool
+
+from deepeval.tracing.context import current_span_context
+from deepeval.tracing.types import ToolSpan
+
+
+def tool(*args, metric=None, metric_collection=None, **kwargs) -> Callable:
+    """
+    Simple wrapper around crewai.tools.tool that:
+    - prints the original function's input and output
+    - accepts additional parameters: metric and metric_collection (unused, for compatibility)
+    - remains backward compatible with CrewAI's decorator usage patterns
+    """
+    crewai_kwargs = kwargs
+
+    # Case 1: @tool (function passed directly)
+    if len(args) == 1 and callable(args[0]):
+        f = args[0]
+        tool_name = f.__name__
+
+        @functools.wraps(f)
+        def wrapped(*f_args, **f_kwargs):
+            current_span = current_span_context.get()
+            if current_span and isinstance(current_span, ToolSpan):
+                current_span.metric_collection = metric_collection
+                current_span.metrics = metric
+            result = f(*f_args, **f_kwargs)
+            return result
+
+        return crewai_tool(tool_name, **crewai_kwargs)(wrapped)
+
+    # Case 2: @tool("name")
+    if len(args) == 1 and isinstance(args[0], str):
+        tool_name = args[0]
+
+        def _decorator(f: Callable) -> Callable:
+            @functools.wraps(f)
+            def wrapped(*f_args, **f_kwargs):
+                current_span = current_span_context.get()
+                if current_span and isinstance(current_span, ToolSpan):
+                    current_span.metric_collection = metric_collection
+                    current_span.metrics = metric
+                result = f(*f_args, **f_kwargs)
+                return result
+
+            return crewai_tool(tool_name, **crewai_kwargs)(wrapped)
+
+        return _decorator
+
+    # Case 3: @tool(result_as_answer=True, ...) — kwargs only
+    if len(args) == 0:
+
+        def _decorator(f: Callable) -> Callable:
+            tool_name = f.__name__
+
+            @functools.wraps(f)
+            def wrapped(*f_args, **f_kwargs):
+                current_span = current_span_context.get()
+                if current_span and isinstance(current_span, ToolSpan):
+                    current_span.metric_collection = metric_collection
+                    current_span.metrics = metric
+                result = f(*f_args, **f_kwargs)
+                return result
+
+            return crewai_tool(tool_name, **crewai_kwargs)(wrapped)
+
+        return _decorator
+
+    raise ValueError("Invalid arguments")
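The wrapped decorator keeps CrewAI's three call styles and adds metric and metric_collection keyword arguments that are copied onto the active ToolSpan at call time. A small usage sketch, with the tool body and collection name made up for illustration:

from deepeval.integrations.crewai import tool

@tool("Search Tool", metric_collection="tool-metrics")
def search(query: str) -> str:
    """Search the web for a query."""
    return f"results for {query}"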
deepeval/integrations/crewai/wrapper.py
CHANGED
@@ -3,6 +3,7 @@ from crewai.crew import Crew
 from crewai.agent import Agent
 from functools import wraps
 from deepeval.tracing.tracing import Observer
+from typing import Any


 def wrap_crew_kickoff():
@@ -10,7 +11,13 @@ def wrap_crew_kickoff():

     @wraps(original_kickoff)
     def wrapper(self, *args, **kwargs):
-
+        metric_collection, metrics = _check_metrics_and_metric_collection(self)
+        with Observer(
+            span_type="crew",
+            func_name="kickoff",
+            metric_collection=metric_collection,
+            metrics=metrics,
+        ):
             result = original_kickoff(self, *args, **kwargs)

             return result
@@ -23,7 +30,13 @@ def wrap_crew_kickoff_for_each():

     @wraps(original_kickoff_for_each)
     def wrapper(self, *args, **kwargs):
-
+        metric_collection, metrics = _check_metrics_and_metric_collection(self)
+        with Observer(
+            span_type="crew",
+            func_name="kickoff_for_each",
+            metric_collection=metric_collection,
+            metrics=metrics,
+        ):
             result = original_kickoff_for_each(self, *args, **kwargs)

             return result
@@ -36,7 +49,13 @@ def wrap_crew_kickoff_async():

     @wraps(original_kickoff_async)
     async def wrapper(self, *args, **kwargs):
-
+        metric_collection, metrics = _check_metrics_and_metric_collection(self)
+        with Observer(
+            span_type="crew",
+            func_name="kickoff_async",
+            metric_collection=metric_collection,
+            metrics=metrics,
+        ):
             result = await original_kickoff_async(self, *args, **kwargs)

             return result
@@ -49,7 +68,13 @@ def wrap_crew_kickoff_for_each_async():

     @wraps(original_kickoff_for_each_async)
     async def wrapper(self, *args, **kwargs):
-
+        metric_collection, metrics = _check_metrics_and_metric_collection(self)
+        with Observer(
+            span_type="crew",
+            func_name="kickoff_for_each_async",
+            metric_collection=metric_collection,
+            metrics=metrics,
+        ):
             result = await original_kickoff_for_each_async(
                 self, *args, **kwargs
             )
@@ -64,10 +89,13 @@ def wrap_llm_call():

     @wraps(original_llm_call)
     def wrapper(self, *args, **kwargs):
+        metric_collection, metrics = _check_metrics_and_metric_collection(self)
         with Observer(
             span_type="llm",
             func_name="call",
             observe_kwargs={"model": "temp_model"},
+            metric_collection=metric_collection,
+            metrics=metrics,
         ):
             result = original_llm_call(self, *args, **kwargs)
             return result
@@ -80,8 +108,20 @@ def wrap_agent_execute_task():

     @wraps(original_execute_task)
     def wrapper(self, *args, **kwargs):
-
+        metric_collection, metrics = _check_metrics_and_metric_collection(self)
+        with Observer(
+            span_type="agent",
+            func_name="execute_task",
+            metric_collection=metric_collection,
+            metrics=metrics,
+        ):
             result = original_execute_task(self, *args, **kwargs)
             return result

     Agent.execute_task = wrapper
+
+
+def _check_metrics_and_metric_collection(obj: Any):
+    metric_collection = getattr(obj, "_metric_collection", None)
+    metrics = getattr(obj, "_metrics", None)
+    return metric_collection, metrics
deepeval/integrations/llama_index/handler.py
CHANGED
@@ -5,6 +5,10 @@ import uuid
 from deepeval.telemetry import capture_tracing_integration
 from deepeval.tracing import trace_manager
 from deepeval.tracing.types import AgentSpan, BaseSpan, LlmSpan, TraceSpanStatus
+from deepeval.tracing.trace_context import (
+    current_llm_context,
+    current_agent_context,
+)

 try:
     from llama_index.core.instrumentation.events.base import BaseEvent
@@ -22,11 +26,6 @@ try:
         LLMChatEndEvent,
     )
     from llama_index_instrumentation.dispatcher import Dispatcher
-    from deepeval.integrations.llama_index.agent.patched import (
-        FunctionAgent as PatchedFunctionAgent,
-        ReActAgent as PatchedReActAgent,
-        CodeActAgent as PatchedCodeActAgent,
-    )
     from deepeval.integrations.llama_index.utils import (
         parse_id,
         prepare_input_llm_test_case_params,
@@ -67,6 +66,7 @@ class LLamaIndexHandler(BaseEventHandler, BaseSpanHandler):
                 ).strip()
                 input_messages.append({"role": role, "content": content})

+            llm_span_context = current_llm_context.get()
             # create the span
             llm_span = LlmSpan(
                 name="ConfidentLLMSpan",
@@ -83,6 +83,12 @@ class LLamaIndexHandler(BaseEventHandler, BaseSpanHandler):
                 ),  # check the model name not coming in this option
                 input=input_messages,
                 output="",
+                metrics=llm_span_context.metrics if llm_span_context else None,
+                metric_collection=(
+                    llm_span_context.metric_collection
+                    if llm_span_context
+                    else None
+                ),
             )
             trace_manager.add_span(llm_span)
             trace_manager.add_span_to_trace(llm_span)
@@ -144,6 +150,7 @@ class LLamaIndexHandler(BaseEventHandler, BaseSpanHandler):

         # conditions to qualify as agent start run span
         if method_name == "run":
+            agent_span_context = current_agent_context.get()
             span = AgentSpan(
                 uuid=id_,
                 status=TraceSpanStatus.IN_PROGRESS,
@@ -153,24 +160,16 @@ class LLamaIndexHandler(BaseEventHandler, BaseSpanHandler):
                 start_time=perf_counter(),
                 name="Agent",  # TODO: decide the name of the span
                 input=bound_args.arguments,
+                metrics=(
+                    agent_span_context.metrics if agent_span_context else None
+                ),
+                metric_collection=(
+                    agent_span_context.metric_collection
+                    if agent_span_context
+                    else None
+                ),
             )

-            # check if the instance is a PatchedFunctionAgent
-            if isinstance(instance, PatchedFunctionAgent):
-                span.name = "FunctionAgent"
-                span.metric_collection = instance.metric_collection
-                span.metrics = instance.metrics
-
-            if isinstance(instance, PatchedReActAgent):
-                span.name = "ReActAgent"
-                span.metric_collection = instance.metric_collection
-                span.metrics = instance.metrics
-
-            if isinstance(instance, PatchedCodeActAgent):
-                span.name = "CodeActAgent"
-                span.metric_collection = instance.metric_collection
-                span.metrics = instance.metrics
-
             # prepare input test case params for the span
             prepare_input_llm_test_case_params(
                 class_name, method_name, span, bound_args.arguments
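The handler now pulls per-span metric settings from current_llm_context / current_agent_context instead of from the removed patched agent subclasses. Since deepeval's trace_context API only appears here through .get(), the following is a generic sketch of the underlying context-variable pattern, not the library's actual interface:

import contextvars
from dataclasses import dataclass
from typing import List, Optional


@dataclass
class SpanContext:
    # Assumed shape: the handler above reads .metrics and .metric_collection.
    metrics: Optional[List[object]] = None
    metric_collection: Optional[str] = None


current_llm_context: contextvars.ContextVar[Optional[SpanContext]] = contextvars.ContextVar(
    "current_llm_context", default=None
)


def llm_span_metric_kwargs() -> dict:
    # Mirrors the fallback logic in the handler: None when no context is set.
    ctx = current_llm_context.get()
    return {
        "metrics": ctx.metrics if ctx else None,
        "metric_collection": ctx.metric_collection if ctx else None,
    }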