deepeval 3.6.4__py3-none-any.whl → 3.6.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/__init__.py +42 -10
- deepeval/_version.py +1 -1
- deepeval/config/logging.py +33 -0
- deepeval/config/settings.py +167 -12
- deepeval/dataset/dataset.py +8 -2
- deepeval/evaluate/evaluate.py +8 -2
- deepeval/evaluate/execute.py +28 -30
- deepeval/evaluate/types.py +4 -1
- deepeval/evaluate/utils.py +46 -29
- deepeval/integrations/crewai/__init__.py +1 -2
- deepeval/integrations/crewai/handler.py +153 -81
- deepeval/integrations/crewai/wrapper.py +87 -0
- deepeval/integrations/pydantic_ai/instrumentator.py +48 -9
- deepeval/integrations/pydantic_ai/test_instrumentator.py +0 -0
- deepeval/metrics/faithfulness/faithfulness.py +8 -0
- deepeval/metrics/g_eval/g_eval.py +26 -15
- deepeval/metrics/prompt_alignment/prompt_alignment.py +41 -23
- deepeval/models/retry_policy.py +202 -11
- deepeval/test_run/__init__.py +2 -1
- deepeval/test_run/api.py +1 -0
- deepeval/test_run/test_run.py +85 -9
- deepeval/tracing/__init__.py +2 -0
- deepeval/tracing/otel/exporter.py +0 -6
- deepeval/tracing/otel/test_exporter.py +35 -0
- deepeval/tracing/otel/utils.py +57 -7
- deepeval/tracing/trace_context.py +14 -0
- deepeval/tracing/trace_test_manager.py +19 -0
- deepeval/tracing/tracing.py +7 -6
- deepeval/tracing/utils.py +2 -86
- deepeval/utils.py +149 -1
- {deepeval-3.6.4.dist-info → deepeval-3.6.6.dist-info}/METADATA +1 -1
- {deepeval-3.6.4.dist-info → deepeval-3.6.6.dist-info}/RECORD +35 -31
- deepeval/integrations/crewai/agent.py +0 -98
- deepeval/integrations/crewai/patch.py +0 -41
- {deepeval-3.6.4.dist-info → deepeval-3.6.6.dist-info}/LICENSE.md +0 -0
- {deepeval-3.6.4.dist-info → deepeval-3.6.6.dist-info}/WHEEL +0 -0
- {deepeval-3.6.4.dist-info → deepeval-3.6.6.dist-info}/entry_points.txt +0 -0
deepeval/evaluate/utils.py
CHANGED

@@ -1,9 +1,10 @@
 import ast
 import inspect
-from typing import Optional, List, Callable, Union
-import os
-
+from typing import Optional, List, Callable, Union
+import os
+import time
 
+from deepeval.utils import format_turn
 from deepeval.test_case.conversational_test_case import Turn
 from deepeval.test_run.api import TurnApi
 from deepeval.test_run.test_run import TestRunResultDisplay
@@ -34,6 +35,29 @@ from deepeval.tracing.utils import (
 )
 
 
+def _is_metric_successful(metric_data: MetricData) -> bool:
+    """
+    Robustly determine success for a metric row.
+
+    Rationale:
+    - If the metric recorded an error, treat as failure.
+    - Be defensive: custom rows may not be MetricData at runtime.
+    """
+    if getattr(metric_data, "error", None):
+        return False
+
+    s = getattr(metric_data, "success", None)
+    if isinstance(s, bool):
+        return s
+    if s is None:
+        return False
+    if isinstance(s, (int, float)):
+        return bool(s)
+    if isinstance(s, str):
+        return s.strip().lower() in {"true", "t", "1", "yes", "y"}
+    return False
+
+
 def create_metric_data(metric: BaseMetric) -> MetricData:
     if metric.error is not None:
         return MetricData(
@@ -75,6 +99,7 @@ def create_test_result(
             metrics_data=api_test_case.metrics_data,
             conversational=True,
             additional_metadata=api_test_case.additional_metadata,
+            turns=api_test_case.turns,
         )
     else:
         multimodal = (
@@ -112,6 +137,7 @@ def create_api_turn(turn: Turn, index: int) -> TurnApi:
     return TurnApi(
         role=turn.role,
         content=turn.content,
+        user_id=turn.user_id,
         retrievalContext=turn.retrieval_context,
         toolsCalled=turn.tools_called,
         additionalMetadata=turn.additional_metadata,
@@ -372,17 +398,7 @@ def print_test_result(test_result: TestResult, display: TestRunResultDisplay):
     print("Metrics Summary\n")
 
     for metric_data in test_result.metrics_data:
-        successful = True
-        if metric_data.error is not None:
-            successful = False
-        else:
-            # This try block is for user defined custom metrics,
-            # which might not handle the score == undefined case elegantly
-            try:
-                if not metric_data.success:
-                    successful = False
-            except:
-                successful = False
+        successful = _is_metric_successful(metric_data)
 
         if not successful:
             print(
@@ -401,9 +417,14 @@ def print_test_result(test_result: TestResult, display: TestRunResultDisplay):
 
     elif test_result.conversational:
         print("For conversational test case:\n")
-
-
-
+        if test_result.turns:
+            print(" Turns:")
+            turns = sorted(test_result.turns, key=lambda t: t.order)
+            for t in turns:
+                print(format_turn(t))
+        else:
+            print(" - No turns recorded in this test case.")
+
     else:
         print("For test case:\n")
         print(f" - input: {test_result.input}")
@@ -470,15 +491,7 @@ def write_test_result_to_file(
     file.write("Metrics Summary\n\n")
 
     for metric_data in test_result.metrics_data:
-        successful = True
-        if metric_data.error is not None:
-            successful = False
-        else:
-            try:
-                if not metric_data.success:
-                    successful = False
-            except:
-                successful = False
+        successful = _is_metric_successful(metric_data)
 
         if not successful:
             file.write(
@@ -500,9 +513,13 @@ def write_test_result_to_file(
         file.write(f" - actual output: {test_result.actual_output}\n")
     elif test_result.conversational:
         file.write("For conversational test case:\n\n")
-
-            "
-
+        if test_result.turns:
+            file.write(" Turns:\n")
+            turns = sorted(test_result.turns, key=lambda t: t.order)
+            for t in turns:
+                file.write(format_turn(t) + "\n")
+        else:
+            file.write(" - No turns recorded in this test case.\n")
     else:
         file.write("For test case:\n\n")
         file.write(f" - input: {test_result.input}\n")
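Note on the refactor above: `_is_metric_successful` centralizes the duplicated try/except blocks that `print_test_result` and `write_test_result_to_file` previously carried, and it also coerces non-boolean `success` values from custom metric rows. A minimal sketch of the coercion it implies, assuming the helper is in scope (it is module-private to deepeval/evaluate/utils.py; the `SimpleNamespace` rows below are hypothetical stand-ins for metric data):

from types import SimpleNamespace

rows = [
    SimpleNamespace(error=None, success=True),       # True: plain bool passes through
    SimpleNamespace(error="timeout", success=True),  # False: a recorded error always wins
    SimpleNamespace(error=None, success="yes"),      # True: truthy string coerced
    SimpleNamespace(error=None, success=None),       # False: missing verdict
    SimpleNamespace(error=None, success=0.0),        # False: falsy number
]
for row in rows:
    print(_is_metric_successful(row))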
deepeval/integrations/crewai/handler.py
CHANGED

@@ -1,30 +1,50 @@
-
+import logging
 import deepeval
-
-
-    agent_registry,
-)
-from deepeval.integrations.crewai.patch import patch_build_context_for_task
+
+from typing import Optional
 from deepeval.telemetry import capture_tracing_integration
-from deepeval.tracing.
+from deepeval.tracing.context import current_span_context, current_trace_context
+from deepeval.tracing.tracing import Observer
+from deepeval.tracing.types import LlmSpan
+from deepeval.config.settings import get_settings
+
+
+logger = logging.getLogger(__name__)
+
 
 try:
-    from crewai.crew import Crew
-    from crewai.llm import LLM
-    from crewai.agent import Agent
-    from crewai.utilities.events import AgentExecutionCompletedEvent
     from crewai.utilities.events.base_event_listener import BaseEventListener
-    from crewai.
-
-
-
-
-
+    from crewai.events import (
+        CrewKickoffStartedEvent,
+        CrewKickoffCompletedEvent,
+        LLMCallStartedEvent,
+        LLMCallCompletedEvent,
+        AgentExecutionStartedEvent,
+        AgentExecutionCompletedEvent,
+        ToolUsageStartedEvent,
+        ToolUsageFinishedEvent,
+    )
 
     crewai_installed = True
-except:
+except ImportError as e:
+    if get_settings().DEEPEVAL_VERBOSE_MODE:
+        if isinstance(e, ModuleNotFoundError):
+            logger.warning(
+                "Optional crewai dependency not installed: %s",
+                e.name,
+                stacklevel=2,
+            )
+        else:
+            logger.warning(
+                "Optional crewai import failed: %s",
+                e,
+                stacklevel=2,
+            )
+
     crewai_installed = False
 
+IS_WRAPPED_ALL = False
+
 
 def is_crewai_installed():
     if not crewai_installed:
@@ -33,81 +53,114 @@ def is_crewai_installed():
         )
 
 
-from deepeval.test_case.llm_test_case import LLMTestCase
-from deepeval.tracing.tracing import (
-    observe,
-    current_span_context,
-    trace_manager,
-    current_trace_context,
-)
-
-
 class CrewAIEventsListener(BaseEventListener):
     def __init__(self):
         is_crewai_installed()
         super().__init__()
+        self.span_observers: dict[str, Observer] = {}
 
-
+    @staticmethod
+    def get_tool_execution_id(source, event) -> str:
+        source_id = id(source)
+        task_id = getattr(event, "task_id", "unknown")
+        agent_id = getattr(event, "agent_id", "unknown")
+        tool_name = getattr(event, "tool_name", "unknown")
+        execution_id = f"tool_{source_id}_{task_id}_{agent_id}_{tool_name}"
 
-
-
-
-    )
+        return execution_id
+
+    def setup_listeners(self, crewai_event_bus):
+        @crewai_event_bus.on(CrewKickoffStartedEvent)
+        def on_crew_started(source, event: CrewKickoffStartedEvent):
+            # Assuming that this event is called in the crew.kickoff method
             current_span = current_span_context.get()
 
-
-
-
-
-
-
+            # set the input
+            if current_span:
+                current_span.input = event.inputs
+
+            # set trace input
+            current_trace = current_trace_context.get()
+            if current_trace:
+                current_trace.input = event.inputs
+
+        @crewai_event_bus.on(CrewKickoffCompletedEvent)
+        def on_crew_completed(source, event: CrewKickoffCompletedEvent):
+            # Assuming that this event is called in the crew.kickoff method
+            current_span = current_span_context.get()
 
+            # set the output
             if current_span:
-
-                input = None
-                actual_output = None
-                expected_output = None
-
-                if isinstance(event.task, Task):
-                    input = event.task.prompt()
-                    actual_output = event.output
-                    expected_output = event.task.expected_output
-
-                current_span.input = input
-                current_span.output = actual_output
-                current_span.expected_output = expected_output
-
-                # set metrics
-                if isinstance(source, PatchedAgent):
-                    current_span.metrics = agent_registry.get_metrics(source)
-                    current_span.metric_collection = (
-                        agent_registry.get_metric_collection(source)
-                    )
-
-                # set offline evals
-                if current_span.metric_collection:
-                    trace_manager.integration_traces_to_evaluate.append(
-                        current_trace_context.get()
-                    )
+                current_span.output = str(event.output)
 
-
-
+            # set trace output
+            current_trace = current_trace_context.get()
+            if current_trace:
+                current_trace.output = str(event.output)
+
+        @crewai_event_bus.on(LLMCallStartedEvent)
+        def on_llm_started(source, event: LLMCallStartedEvent):
+            # Assuming that this event is called in the llm.call method
             current_span = current_span_context.get()
-
-
-            current_span
+
+            # set the input
+            if current_span:
+                current_span.input = event.messages
+
+            # set the model
+            if isinstance(current_span, LlmSpan):
+                current_span.model = event.model
 
         @crewai_event_bus.on(LLMCallCompletedEvent)
-        def
+        def on_llm_completed(source, event: LLMCallCompletedEvent):
+            # Assuming that this event is called in the llm.call method
             current_span = current_span_context.get()
 
-
-
-            current_span.model = source.model
-
-            current_span.input = event.messages
+            # set the output
+            if current_span:
                 current_span.output = event.response
 
+        @crewai_event_bus.on(AgentExecutionStartedEvent)
+        def on_agent_started(source, event: AgentExecutionStartedEvent):
+            # Assuming that this event is called in the agent.execute_task method
+            current_span = current_span_context.get()
+
+            # set the input
+            if current_span:
+                current_span.input = event.task_prompt
+
+        @crewai_event_bus.on(AgentExecutionCompletedEvent)
+        def on_agent_completed(source, event: AgentExecutionCompletedEvent):
+            # Assuming that this event is called in the agent.execute_task method
+            current_span = current_span_context.get()
+
+            # set the output
+            if current_span:
+                current_span.output = event.output
+
+        @crewai_event_bus.on(ToolUsageStartedEvent)
+        def on_tool_started(source, event: ToolUsageStartedEvent):
+            observer = Observer(
+                span_type="tool",
+                func_name=event.tool_name,
+                function_kwargs=event.tool_args,
+            )
+            self.span_observers[self.get_tool_execution_id(source, event)] = (
+                observer
+            )
+            observer.__enter__()
+
+        @crewai_event_bus.on(ToolUsageFinishedEvent)
+        def on_tool_completed(source, event: ToolUsageFinishedEvent):
+            observer = self.span_observers.pop(
+                self.get_tool_execution_id(source, event)
+            )
+            if observer:
+                current_span = current_span_context.get()
+                if current_span:
+                    current_span.output = event.output
+                observer.__exit__(None, None, None)
+
 
 def instrument_crewai(api_key: Optional[str] = None):
     is_crewai_installed()
@@ -115,10 +168,29 @@ def instrument_crewai(api_key: Optional[str] = None):
     if api_key:
         deepeval.login(api_key)
 
-
-
-    Agent.execute_task = observe(Agent.execute_task, type="agent")
-    CrewAgentExecutor.invoke = observe(CrewAgentExecutor.invoke)
-    ToolUsage.use = observe(ToolUsage.use, type="tool")
-    patch_build_context_for_task()
+    wrap_all()
+
     CrewAIEventsListener()
+
+
+def wrap_all():
+    global IS_WRAPPED_ALL
+
+    if not IS_WRAPPED_ALL:
+        from deepeval.integrations.crewai.wrapper import (
+            wrap_crew_kickoff,
+            wrap_crew_kickoff_for_each,
+            wrap_crew_kickoff_async,
+            wrap_crew_kickoff_for_each_async,
+            wrap_llm_call,
+            wrap_agent_execute_task,
+        )
+
+        wrap_crew_kickoff()
+        wrap_crew_kickoff_for_each()
+        wrap_crew_kickoff_async()
+        wrap_crew_kickoff_for_each_async()
+        wrap_llm_call()
+        wrap_agent_execute_task()
+
+        IS_WRAPPED_ALL = True
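For orientation, a hedged usage sketch of the reworked integration: `instrument_crewai` now monkey-patches the CrewAI entry points once via `wrap_all()` (guarded by `IS_WRAPPED_ALL`, so repeated calls are no-ops) and registers a `CrewAIEventsListener` that fills spans from the event bus. The import path assumes `instrument_crewai` is re-exported from `deepeval.integrations.crewai`; the API key value is a placeholder:

from deepeval.integrations.crewai import instrument_crewai  # assumed re-export

instrument_crewai(api_key="<CONFIDENT_API_KEY>")

# After this call, Crew.kickoff / Crew.kickoff_async / LLM.call /
# Agent.execute_task all run inside Observer spans, and the event
# handlers above populate span and trace inputs, outputs, and models.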
deepeval/integrations/crewai/wrapper.py
ADDED

@@ -0,0 +1,87 @@
+from crewai.llm import LLM
+from crewai.crew import Crew
+from crewai.agent import Agent
+from functools import wraps
+from deepeval.tracing.tracing import Observer
+
+
+def wrap_crew_kickoff():
+    original_kickoff = Crew.kickoff
+
+    @wraps(original_kickoff)
+    def wrapper(self, *args, **kwargs):
+        with Observer(span_type="crew", func_name="kickoff"):
+            result = original_kickoff(self, *args, **kwargs)
+
+        return result
+
+    Crew.kickoff = wrapper
+
+
+def wrap_crew_kickoff_for_each():
+    original_kickoff_for_each = Crew.kickoff_for_each
+
+    @wraps(original_kickoff_for_each)
+    def wrapper(self, *args, **kwargs):
+        with Observer(span_type="crew", func_name="kickoff_for_each"):
+            result = original_kickoff_for_each(self, *args, **kwargs)
+
+        return result
+
+    Crew.kickoff_for_each = wrapper
+
+
+def wrap_crew_kickoff_async():
+    original_kickoff_async = Crew.kickoff_async
+
+    @wraps(original_kickoff_async)
+    async def wrapper(self, *args, **kwargs):
+        with Observer(span_type="crew", func_name="kickoff_async"):
+            result = await original_kickoff_async(self, *args, **kwargs)
+
+        return result
+
+    Crew.kickoff_async = wrapper
+
+
+def wrap_crew_kickoff_for_each_async():
+    original_kickoff_for_each_async = Crew.kickoff_for_each_async
+
+    @wraps(original_kickoff_for_each_async)
+    async def wrapper(self, *args, **kwargs):
+        with Observer(span_type="crew", func_name="kickoff_for_each_async"):
+            result = await original_kickoff_for_each_async(
+                self, *args, **kwargs
+            )
+
+        return result
+
+    Crew.kickoff_for_each_async = wrapper
+
+
+def wrap_llm_call():
+    original_llm_call = LLM.call
+
+    @wraps(original_llm_call)
+    def wrapper(self, *args, **kwargs):
+        with Observer(
+            span_type="llm",
+            func_name="call",
+            observe_kwargs={"model": "temp_model"},
+        ):
+            result = original_llm_call(self, *args, **kwargs)
+            return result
+
+    LLM.call = wrapper
+
+
+def wrap_agent_execute_task():
+    original_execute_task = Agent.execute_task
+
+    @wraps(original_execute_task)
+    def wrapper(self, *args, **kwargs):
+        with Observer(span_type="agent", func_name="execute_task"):
+            result = original_execute_task(self, *args, **kwargs)
+            return result
+
+    Agent.execute_task = wrapper
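All six wrappers above share one monkey-patching shape: capture the original method, preserve its metadata with `functools.wraps`, run the call inside an `Observer` context manager, and assign the wrapper back onto the class. A generic sketch of that shape, for reference (`patch_method` is a hypothetical helper, not part of the package):

from functools import wraps

from deepeval.tracing.tracing import Observer


def patch_method(cls, method_name: str, span_type: str) -> None:
    # Hypothetical generalization of the wrap_* functions above.
    original = getattr(cls, method_name)

    @wraps(original)  # keep the original __name__/__doc__ on the patched method
    def wrapper(self, *args, **kwargs):
        with Observer(span_type=span_type, func_name=method_name):
            return original(self, *args, **kwargs)

    setattr(cls, method_name, wrapper)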
deepeval/integrations/pydantic_ai/instrumentator.py
CHANGED

@@ -1,7 +1,19 @@
 import json
+import logging
 import os
 from typing import Literal, Optional, List
 
+from deepeval.config.settings import get_settings
+from deepeval.confident.api import get_confident_api_key
+from deepeval.prompt import Prompt
+from deepeval.tracing.context import current_trace_context
+from deepeval.tracing.types import Trace
+from deepeval.tracing.otel.utils import to_hex_string
+
+
+logger = logging.getLogger(__name__)
+
+
 try:
     from pydantic_ai.models.instrumented import InstrumentationSettings
     from opentelemetry.sdk.trace import SpanProcessor, TracerProvider
@@ -11,7 +23,20 @@ try:
     )
 
     dependency_installed = True
-except:
+except ImportError as e:
+    if get_settings().DEEPEVAL_VERBOSE_MODE:
+        if isinstance(e, ModuleNotFoundError):
+            logger.warning(
+                "Optional tracing dependency not installed: %s",
+                e.name,
+                stacklevel=2,
+            )
+        else:
+            logger.warning(
+                "Optional tracing import failed: %s",
+                e,
+                stacklevel=2,
+            )
     dependency_installed = False
 
 
@@ -25,6 +50,10 @@ def is_dependency_installed():
 
 from deepeval.confident.api import get_confident_api_key
 from deepeval.prompt import Prompt
+from deepeval.tracing.otel.test_exporter import test_exporter
+from deepeval.tracing.context import current_trace_context
+from deepeval.tracing.types import Trace
+from deepeval.tracing.otel.utils import to_hex_string
 
 # OTLP_ENDPOINT = "http://127.0.0.1:4318/v1/traces"
 OTLP_ENDPOINT = "https://otel.confident-ai.com/v1/traces"
@@ -37,6 +66,12 @@ class SpanInterceptor(SpanProcessor):
 
     def on_start(self, span, parent_context):
 
+        # set trace uuid
+        _current_trace_context = current_trace_context.get()
+        if _current_trace_context and isinstance(_current_trace_context, Trace):
+            _otel_trace_id = span.get_span_context().trace_id
+            _current_trace_context.uuid = to_hex_string(_otel_trace_id, 32)
+
         # set trace attributes
         if self.settings.thread_id:
             span.set_attribute(
@@ -148,8 +183,9 @@ class ConfidentInstrumentationSettings(InstrumentationSettings):
         confident_prompt: Optional[Prompt] = None,
         llm_metric_collection: Optional[str] = None,
         agent_metric_collection: Optional[str] = None,
-        tool_metric_collection_map: dict = {},
+        tool_metric_collection_map: Optional[dict] = None,
         trace_metric_collection: Optional[str] = None,
+        is_test_mode: Optional[bool] = False,
     ):
         is_dependency_installed()
 
@@ -162,7 +198,7 @@ class ConfidentInstrumentationSettings(InstrumentationSettings):
         ]:
             self.environment = _environment
 
-        self.tool_metric_collection_map = tool_metric_collection_map
+        self.tool_metric_collection_map = tool_metric_collection_map or {}
         self.name = name
         self.thread_id = thread_id
         self.user_id = user_id
@@ -185,12 +221,15 @@ class ConfidentInstrumentationSettings(InstrumentationSettings):
         span_interceptor = SpanInterceptor(self)
         trace_provider.add_span_processor(span_interceptor)
 
-        trace_provider.add_span_processor(
-            BatchSpanProcessor(
-                OTLPSpanExporter(
-                    endpoint=OTLP_ENDPOINT,
-                    headers={"x-confident-api-key": api_key},
+        if is_test_mode:
+            trace_provider.add_span_processor(BatchSpanProcessor(test_exporter))
+        else:
+            trace_provider.add_span_processor(
+                BatchSpanProcessor(
+                    OTLPSpanExporter(
+                        endpoint=OTLP_ENDPOINT,
+                        headers={"x-confident-api-key": api_key},
+                    )
                 )
             )
-        )
         super().__init__(tracer_provider=trace_provider)
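The new `is_test_mode` flag swaps the Confident OTLP exporter for the in-memory `test_exporter`, which the added `test_exporter.py` and `test_instrumentator.py` modules can then inspect. A hedged sketch of how it would be switched on in a test, assuming no other constructor arguments are required (most are optional per the signature above):

from deepeval.integrations.pydantic_ai.instrumentator import (
    ConfidentInstrumentationSettings,
)

# Spans go to the in-memory test exporter instead of
# https://otel.confident-ai.com/v1/traces; handy for CI assertions.
settings = ConfidentInstrumentationSettings(is_test_mode=True)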
deepeval/integrations/pydantic_ai/test_instrumentator.py
File without changes
deepeval/metrics/faithfulness/faithfulness.py
CHANGED

@@ -41,6 +41,7 @@ class FaithfulnessMetric(BaseMetric):
         strict_mode: bool = False,
         verbose_mode: bool = False,
         truths_extraction_limit: Optional[int] = None,
+        penalize_ambiguous_claims: bool = False,
         evaluation_template: Type[FaithfulnessTemplate] = FaithfulnessTemplate,
     ):
         self.threshold = 1 if strict_mode else threshold
@@ -51,6 +52,7 @@ class FaithfulnessMetric(BaseMetric):
         self.strict_mode = strict_mode
         self.verbose_mode = verbose_mode
         self.evaluation_template = evaluation_template
+        self.penalize_ambiguous_claims = penalize_ambiguous_claims
 
         self.truths_extraction_limit = truths_extraction_limit
         if self.truths_extraction_limit is not None:
@@ -329,6 +331,12 @@ class FaithfulnessMetric(BaseMetric):
             if verdict.verdict.strip().lower() != "no":
                 faithfulness_count += 1
 
+            if (
+                self.penalize_ambiguous_claims
+                and verdict.verdict.strip().lower() == "idk"
+            ):
+                faithfulness_count -= 1
+
         score = faithfulness_count / number_of_verdicts
         return 0 if self.strict_mode and score < self.threshold else score
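The scoring change is easy to work through: an "idk" verdict normally counts as faithful (only "no" is excluded), so verdicts [yes, yes, idk, no] score 3/4 = 0.75 by default; with `penalize_ambiguous_claims=True` the "idk" row is first counted (+1) and then penalized (-1), giving (3 - 1)/4 = 0.5. A hedged usage sketch (the test case contents are illustrative):

from deepeval.metrics import FaithfulnessMetric
from deepeval.test_case import LLMTestCase

metric = FaithfulnessMetric(penalize_ambiguous_claims=True)
test_case = LLMTestCase(
    input="Where is the Eiffel Tower?",
    actual_output="Paris. It is probably also the tallest structure in France.",
    retrieval_context=["The Eiffel Tower is in Paris."],
)
metric.measure(test_case)  # ambiguous claims now subtract from the faithful count
print(metric.score)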