deepeval 3.6.3__py3-none-any.whl → 3.6.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/config/settings.py +13 -0
- deepeval/dataset/dataset.py +8 -2
- deepeval/evaluate/evaluate.py +8 -2
- deepeval/evaluate/execute.py +6 -11
- deepeval/evaluate/types.py +4 -1
- deepeval/evaluate/utils.py +46 -29
- deepeval/integrations/crewai/__init__.py +1 -2
- deepeval/integrations/crewai/handler.py +153 -81
- deepeval/integrations/crewai/wrapper.py +87 -0
- deepeval/integrations/pydantic_ai/instrumentator.py +48 -9
- deepeval/integrations/pydantic_ai/test_instrumentator.py +0 -0
- deepeval/metrics/faithfulness/faithfulness.py +8 -0
- deepeval/prompt/prompt.py +133 -86
- deepeval/test_run/__init__.py +2 -1
- deepeval/test_run/api.py +1 -0
- deepeval/test_run/test_run.py +85 -9
- deepeval/tracing/__init__.py +2 -0
- deepeval/tracing/otel/test_exporter.py +35 -0
- deepeval/tracing/trace_context.py +14 -0
- deepeval/tracing/tracing.py +7 -6
- deepeval/tracing/utils.py +2 -86
- deepeval/utils.py +149 -1
- {deepeval-3.6.3.dist-info → deepeval-3.6.5.dist-info}/METADATA +1 -1
- {deepeval-3.6.3.dist-info → deepeval-3.6.5.dist-info}/RECORD +28 -26
- deepeval/integrations/crewai/agent.py +0 -98
- deepeval/integrations/crewai/patch.py +0 -41
- {deepeval-3.6.3.dist-info → deepeval-3.6.5.dist-info}/LICENSE.md +0 -0
- {deepeval-3.6.3.dist-info → deepeval-3.6.5.dist-info}/WHEEL +0 -0
- {deepeval-3.6.3.dist-info → deepeval-3.6.5.dist-info}/entry_points.txt +0 -0
deepeval/_version.py
CHANGED
@@ -1 +1 @@
-__version__: str = "3.6.3"
+__version__: str = "3.6.5"
deepeval/config/settings.py
CHANGED
@@ -180,6 +180,19 @@ class Settings(BaseSettings):
     # into this directory. The directory will be created on demand.
     DEEPEVAL_RESULTS_FOLDER: Optional[Path] = None

+    # Display / Truncation
+    DEEPEVAL_MAXLEN_TINY: Optional[int] = 40
+    DEEPEVAL_MAXLEN_SHORT: Optional[int] = 60
+    DEEPEVAL_MAXLEN_MEDIUM: Optional[int] = 120
+    DEEPEVAL_MAXLEN_LONG: Optional[int] = 240
+
+    # If set, this overrides the default max_len used by deepeval/utils shorten
+    # falls back to DEEPEVAL_MAXLEN_LONG when None.
+    DEEPEVAL_SHORTEN_DEFAULT_MAXLEN: Optional[int] = None
+
+    # Optional global suffix (keeps your "..." default).
+    DEEPEVAL_SHORTEN_SUFFIX: Optional[str] = "..."
+
     #
     # GPU and perf toggles
     #
deepeval/dataset/dataset.py
CHANGED
@@ -1266,11 +1266,17 @@ class EvaluationDataset:
             detach(ctx_token)

         else:
-            confident_link = global_test_run_manager.wrap_up_test_run(
+            res = global_test_run_manager.wrap_up_test_run(
                 run_duration, display_table=False
             )
+            if isinstance(res, tuple):
+                confident_link, test_run_id = res
+            else:
+                confident_link = test_run_id = None
             return EvaluationResult(
-                test_results=test_results, confident_link=confident_link
+                test_results=test_results,
+                confident_link=confident_link,
+                test_run_id=test_run_id,
             )

     def evaluate(self, task: Task):
deepeval/evaluate/evaluate.py
CHANGED
@@ -268,11 +268,17 @@ def evaluate(
         test_run = global_test_run_manager.get_test_run()
         test_run.hyperparameters = process_hyperparameters(hyperparameters)
         global_test_run_manager.save_test_run(TEMP_FILE_PATH)
-        confident_link = global_test_run_manager.wrap_up_test_run(
+        res = global_test_run_manager.wrap_up_test_run(
             run_duration, display_table=False
         )
+        if isinstance(res, tuple):
+            confident_link, test_run_id = res
+        else:
+            confident_link = test_run_id = None
         return EvaluationResult(
-            test_results=test_results, confident_link=confident_link
+            test_results=test_results,
+            confident_link=confident_link,
+            test_run_id=test_run_id,
         )
     elif metric_collection:
         api = Api()
deepeval/evaluate/execute.py
CHANGED
@@ -45,9 +45,7 @@ from deepeval.dataset import Golden
 from deepeval.contextvars import set_current_golden, reset_current_golden
 from deepeval.errors import MissingTestCaseParamsError
 from deepeval.metrics.utils import copy_metrics
-from deepeval.utils import (
-    get_or_create_event_loop,
-)
+from deepeval.utils import get_or_create_event_loop, shorten, len_medium
 from deepeval.telemetry import capture_evaluation_run
 from deepeval.metrics import (
     BaseMetric,
@@ -1802,14 +1800,11 @@ def a_execute_agentic_test_cases_from_loop(
             )

             # record metadata for debugging
-            MAX_META_INPUT_LENGTH = 120
             started = time.perf_counter()
-            short_input = current_golden_ctx
-            if (
-
-
-            ):
-                short_input = short_input[:MAX_META_INPUT_LENGTH] + "…"
+            short_input = current_golden_ctx.get("input")
+            if isinstance(short_input, str):
+                short_input = shorten(short_input, len_medium())
+
             task_meta[task] = {
                 "golden_index": current_golden_ctx["index"],
                 "golden_name": current_golden_ctx["name"],
@@ -1972,7 +1967,7 @@ def a_execute_agentic_test_cases_from_loop(

         if settings.DEEPEVAL_DEBUG_ASYNC:
             logger.warning(
-                "[deepeval] %d stray task(s) not tracked; cancelling",
+                "[deepeval] %d stray task(s) not tracked; cancelling...",
                 len(leftovers),
             )
             for t in leftovers:
deepeval/evaluate/types.py
CHANGED
@@ -1,7 +1,8 @@
 from typing import Optional, List, Union, Dict
 from dataclasses import dataclass
 from pydantic import BaseModel
-
+
+from deepeval.test_run.api import MetricData, TurnApi
 from deepeval.test_case import MLLMImage


@@ -19,9 +20,11 @@ class TestResult:
     expected_output: Optional[str] = None
     context: Optional[List[str]] = None
     retrieval_context: Optional[List[str]] = None
+    turns: Optional[List[TurnApi]] = None
     additional_metadata: Optional[Dict] = None


 class EvaluationResult(BaseModel):
     test_results: List[TestResult]
     confident_link: Optional[str]
+    test_run_id: Optional[str]
deepeval/evaluate/utils.py
CHANGED
@@ -1,9 +1,10 @@
 import ast
 import inspect
-from typing import Optional, List, Callable, Union
-import os
-
+from typing import Optional, List, Callable, Union
+import os
+import time

+from deepeval.utils import format_turn
 from deepeval.test_case.conversational_test_case import Turn
 from deepeval.test_run.api import TurnApi
 from deepeval.test_run.test_run import TestRunResultDisplay
@@ -34,6 +35,29 @@ from deepeval.tracing.utils import (
 )


+def _is_metric_successful(metric_data: MetricData) -> bool:
+    """
+    Robustly determine success for a metric row.
+
+    Rationale:
+    - If the metric recorded an error, treat as failure.
+    - Be defensive: custom rows may not be MetricData at runtime.
+    """
+    if getattr(metric_data, "error", None):
+        return False
+
+    s = getattr(metric_data, "success", None)
+    if isinstance(s, bool):
+        return s
+    if s is None:
+        return False
+    if isinstance(s, (int, float)):
+        return bool(s)
+    if isinstance(s, str):
+        return s.strip().lower() in {"true", "t", "1", "yes", "y"}
+    return False
+
+
 def create_metric_data(metric: BaseMetric) -> MetricData:
     if metric.error is not None:
         return MetricData(
@@ -75,6 +99,7 @@ def create_test_result(
             metrics_data=api_test_case.metrics_data,
             conversational=True,
             additional_metadata=api_test_case.additional_metadata,
+            turns=api_test_case.turns,
         )
     else:
         multimodal = (
@@ -112,6 +137,7 @@ def create_api_turn(turn: Turn, index: int) -> TurnApi:
     return TurnApi(
         role=turn.role,
         content=turn.content,
+        user_id=turn.user_id,
         retrievalContext=turn.retrieval_context,
         toolsCalled=turn.tools_called,
         additionalMetadata=turn.additional_metadata,
@@ -372,17 +398,7 @@ def print_test_result(test_result: TestResult, display: TestRunResultDisplay):
     print("Metrics Summary\n")

     for metric_data in test_result.metrics_data:
-        successful = True
-        if metric_data.error is not None:
-            successful = False
-        else:
-            # This try block is for user defined custom metrics,
-            # which might not handle the score == undefined case elegantly
-            try:
-                if not metric_data.success:
-                    successful = False
-            except:
-                successful = False
+        successful = _is_metric_successful(metric_data)

         if not successful:
             print(
@@ -401,9 +417,14 @@ def print_test_result(test_result: TestResult, display: TestRunResultDisplay):

     elif test_result.conversational:
         print("For conversational test case:\n")
-
-
-
+        if test_result.turns:
+            print("  Turns:")
+            turns = sorted(test_result.turns, key=lambda t: t.order)
+            for t in turns:
+                print(format_turn(t))
+        else:
+            print("  - No turns recorded in this test case.")
+
     else:
         print("For test case:\n")
         print(f"  - input: {test_result.input}")
@@ -470,15 +491,7 @@ def write_test_result_to_file(
     file.write("Metrics Summary\n\n")

     for metric_data in test_result.metrics_data:
-        successful = True
-        if metric_data.error is not None:
-            successful = False
-        else:
-            try:
-                if not metric_data.success:
-                    successful = False
-            except:
-                successful = False
+        successful = _is_metric_successful(metric_data)

         if not successful:
             file.write(
@@ -500,9 +513,13 @@ def write_test_result_to_file(
         file.write(f"  - actual output: {test_result.actual_output}\n")
     elif test_result.conversational:
         file.write("For conversational test case:\n\n")
-
-
-
+        if test_result.turns:
+            file.write("  Turns:\n")
+            turns = sorted(test_result.turns, key=lambda t: t.order)
+            for t in turns:
+                file.write(format_turn(t) + "\n")
+        else:
+            file.write("  - No turns recorded in this test case.\n")
     else:
         file.write("For test case:\n\n")
         file.write(f"  - input: {test_result.input}\n")
deepeval/integrations/crewai/handler.py
CHANGED
@@ -1,30 +1,50 @@
-from typing import Optional
+import logging
 import deepeval
-from deepeval.integrations.crewai.agent import (
-    PatchedAgent,
-    agent_registry,
-)
-from deepeval.integrations.crewai.patch import patch_build_context_for_task
+
+from typing import Optional
 from deepeval.telemetry import capture_tracing_integration
-from deepeval.tracing.
+from deepeval.tracing.context import current_span_context, current_trace_context
+from deepeval.tracing.tracing import Observer
+from deepeval.tracing.types import LlmSpan
+from deepeval.config.settings import get_settings
+
+
+logger = logging.getLogger(__name__)
+

 try:
-    from crewai.crew import Crew
-    from crewai.llm import LLM
-    from crewai.agent import Agent
-    from crewai.utilities.events import AgentExecutionCompletedEvent
     from crewai.utilities.events.base_event_listener import BaseEventListener
-    from crewai.
-
-
-
-
-
+    from crewai.events import (
+        CrewKickoffStartedEvent,
+        CrewKickoffCompletedEvent,
+        LLMCallStartedEvent,
+        LLMCallCompletedEvent,
+        AgentExecutionStartedEvent,
+        AgentExecutionCompletedEvent,
+        ToolUsageStartedEvent,
+        ToolUsageFinishedEvent,
+    )

     crewai_installed = True
-except:
+except ImportError as e:
+    if get_settings().DEEPEVAL_VERBOSE_MODE:
+        if isinstance(e, ModuleNotFoundError):
+            logger.warning(
+                "Optional crewai dependency not installed: %s",
+                e.name,
+                stacklevel=2,
+            )
+        else:
+            logger.warning(
+                "Optional crewai import failed: %s",
+                e,
+                stacklevel=2,
+            )
+
     crewai_installed = False

+IS_WRAPPED_ALL = False
+

 def is_crewai_installed():
     if not crewai_installed:
@@ -33,81 +53,114 @@ def is_crewai_installed():
         )


-from deepeval.test_case.llm_test_case import LLMTestCase
-from deepeval.tracing.tracing import (
-    observe,
-    current_span_context,
-    trace_manager,
-    current_trace_context,
-)
-
-
 class CrewAIEventsListener(BaseEventListener):
     def __init__(self):
         is_crewai_installed()
         super().__init__()
+        self.span_observers: dict[str, Observer] = {}

-    def setup_listeners(self, crewai_event_bus):
+    @staticmethod
+    def get_tool_execution_id(source, event) -> str:
+        source_id = id(source)
+        task_id = getattr(event, "task_id", "unknown")
+        agent_id = getattr(event, "agent_id", "unknown")
+        tool_name = getattr(event, "tool_name", "unknown")
+        execution_id = f"tool_{source_id}_{task_id}_{agent_id}_{tool_name}"

-
-
-
-        )
+        return execution_id
+
+    def setup_listeners(self, crewai_event_bus):
+        @crewai_event_bus.on(CrewKickoffStartedEvent)
+        def on_crew_started(source, event: CrewKickoffStartedEvent):
+            # Assuming that this event is called in the crew.kickoff method
             current_span = current_span_context.get()

-
-
-
-
-
-
+            # set the input
+            if current_span:
+                current_span.input = event.inputs
+
+            # set trace input
+            current_trace = current_trace_context.get()
+            if current_trace:
+                current_trace.input = event.inputs
+
+        @crewai_event_bus.on(CrewKickoffCompletedEvent)
+        def on_crew_completed(source, event: CrewKickoffCompletedEvent):
+            # Assuming that this event is called in the crew.kickoff method
+            current_span = current_span_context.get()

+            # set the output
             if current_span:
-
-                input = None
-                actual_output = None
-                expected_output = None
-
-                if isinstance(event.task, Task):
-                    input = event.task.prompt()
-                    actual_output = event.output
-                    expected_output = event.task.expected_output
-
-                current_span.input = input
-                current_span.output = actual_output
-                current_span.expected_output = expected_output
-
-                # set metrics
-                if isinstance(source, PatchedAgent):
-                    current_span.metrics = agent_registry.get_metrics(source)
-                    current_span.metric_collection = (
-                        agent_registry.get_metric_collection(source)
-                    )
-
-                # set offline evals
-                if current_span.metric_collection:
-                    trace_manager.integration_traces_to_evaluate.append(
-                        current_trace_context.get()
-                    )
+                current_span.output = str(event.output)

-
-
+            # set trace output
+            current_trace = current_trace_context.get()
+            if current_trace:
+                current_trace.output = str(event.output)
+
+        @crewai_event_bus.on(LLMCallStartedEvent)
+        def on_llm_started(source, event: LLMCallStartedEvent):
+            # Assuming that this event is called in the llm.call method
             current_span = current_span_context.get()
-
-
-            current_span
+
+            # set the input
+            if current_span:
+                current_span.input = event.messages
+
+            # set the model
+            if isinstance(current_span, LlmSpan):
+                current_span.model = event.model

         @crewai_event_bus.on(LLMCallCompletedEvent)
-        def
+        def on_llm_completed(source, event: LLMCallCompletedEvent):
+            # Assuming that this event is called in the llm.call method
             current_span = current_span_context.get()

-
-
-                current_span.model = source.model
-
-                current_span.input = event.messages
+            # set the output
+            if current_span:
                 current_span.output = event.response

+        @crewai_event_bus.on(AgentExecutionStartedEvent)
+        def on_agent_started(source, event: AgentExecutionStartedEvent):
+            # Assuming that this event is called in the agent.execute_task method
+            current_span = current_span_context.get()
+
+            # set the input
+            if current_span:
+                current_span.input = event.task_prompt
+
+        @crewai_event_bus.on(AgentExecutionCompletedEvent)
+        def on_agent_completed(source, event: AgentExecutionCompletedEvent):
+            # Assuming that this event is called in the agent.execute_task method
+            current_span = current_span_context.get()
+
+            # set the output
+            if current_span:
+                current_span.output = event.output
+
+        @crewai_event_bus.on(ToolUsageStartedEvent)
+        def on_tool_started(source, event: ToolUsageStartedEvent):
+            observer = Observer(
+                span_type="tool",
+                func_name=event.tool_name,
+                function_kwargs=event.tool_args,
+            )
+            self.span_observers[self.get_tool_execution_id(source, event)] = (
+                observer
+            )
+            observer.__enter__()
+
+        @crewai_event_bus.on(ToolUsageFinishedEvent)
+        def on_tool_completed(source, event: ToolUsageFinishedEvent):
+            observer = self.span_observers.pop(
+                self.get_tool_execution_id(source, event)
+            )
+            if observer:
+                current_span = current_span_context.get()
+                if current_span:
+                    current_span.output = event.output
+                observer.__exit__(None, None, None)
+


 def instrument_crewai(api_key: Optional[str] = None):
     is_crewai_installed()
@@ -115,10 +168,29 @@ def instrument_crewai(api_key: Optional[str] = None):
     if api_key:
         deepeval.login(api_key)

-
-
-    Agent.execute_task = observe(Agent.execute_task, type="agent")
-    CrewAgentExecutor.invoke = observe(CrewAgentExecutor.invoke)
-    ToolUsage.use = observe(ToolUsage.use, type="tool")
-    patch_build_context_for_task()
+    wrap_all()
+
     CrewAIEventsListener()
+
+
+def wrap_all():
+    global IS_WRAPPED_ALL
+
+    if not IS_WRAPPED_ALL:
+        from deepeval.integrations.crewai.wrapper import (
+            wrap_crew_kickoff,
+            wrap_crew_kickoff_for_each,
+            wrap_crew_kickoff_async,
+            wrap_crew_kickoff_for_each_async,
+            wrap_llm_call,
+            wrap_agent_execute_task,
+        )
+
+        wrap_crew_kickoff()
+        wrap_crew_kickoff_for_each()
+        wrap_crew_kickoff_async()
+        wrap_crew_kickoff_for_each_async()
+        wrap_llm_call()
+        wrap_agent_execute_task()
+
+        IS_WRAPPED_ALL = True
deepeval/integrations/crewai/wrapper.py
ADDED
@@ -0,0 +1,87 @@
+from crewai.llm import LLM
+from crewai.crew import Crew
+from crewai.agent import Agent
+from functools import wraps
+from deepeval.tracing.tracing import Observer
+
+
+def wrap_crew_kickoff():
+    original_kickoff = Crew.kickoff
+
+    @wraps(original_kickoff)
+    def wrapper(self, *args, **kwargs):
+        with Observer(span_type="crew", func_name="kickoff"):
+            result = original_kickoff(self, *args, **kwargs)
+
+        return result
+
+    Crew.kickoff = wrapper
+
+
+def wrap_crew_kickoff_for_each():
+    original_kickoff_for_each = Crew.kickoff_for_each
+
+    @wraps(original_kickoff_for_each)
+    def wrapper(self, *args, **kwargs):
+        with Observer(span_type="crew", func_name="kickoff_for_each"):
+            result = original_kickoff_for_each(self, *args, **kwargs)
+
+        return result
+
+    Crew.kickoff_for_each = wrapper
+
+
+def wrap_crew_kickoff_async():
+    original_kickoff_async = Crew.kickoff_async
+
+    @wraps(original_kickoff_async)
+    async def wrapper(self, *args, **kwargs):
+        with Observer(span_type="crew", func_name="kickoff_async"):
+            result = await original_kickoff_async(self, *args, **kwargs)
+
+        return result
+
+    Crew.kickoff_async = wrapper
+
+
+def wrap_crew_kickoff_for_each_async():
+    original_kickoff_for_each_async = Crew.kickoff_for_each_async
+
+    @wraps(original_kickoff_for_each_async)
+    async def wrapper(self, *args, **kwargs):
+        with Observer(span_type="crew", func_name="kickoff_for_each_async"):
+            result = await original_kickoff_for_each_async(
+                self, *args, **kwargs
+            )
+
+        return result
+
+    Crew.kickoff_for_each_async = wrapper
+
+
+def wrap_llm_call():
+    original_llm_call = LLM.call
+
+    @wraps(original_llm_call)
+    def wrapper(self, *args, **kwargs):
+        with Observer(
+            span_type="llm",
+            func_name="call",
+            observe_kwargs={"model": "temp_model"},
+        ):
+            result = original_llm_call(self, *args, **kwargs)
+        return result
+
+    LLM.call = wrapper
+
+
+def wrap_agent_execute_task():
+    original_execute_task = Agent.execute_task
+
+    @wraps(original_execute_task)
+    def wrapper(self, *args, **kwargs):
+        with Observer(span_type="agent", func_name="execute_task"):
+            result = original_execute_task(self, *args, **kwargs)
+        return result
+
+    Agent.execute_task = wrapper