deepeval 3.5.9__py3-none-any.whl → 3.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/config/settings_manager.py +1 -1
- deepeval/contextvars.py +25 -0
- deepeval/dataset/__init__.py +8 -2
- deepeval/evaluate/execute.py +15 -3
- deepeval/openai_agents/__init__.py +4 -3
- deepeval/openai_agents/agent.py +8 -166
- deepeval/openai_agents/callback_handler.py +63 -62
- deepeval/openai_agents/extractors.py +83 -7
- deepeval/openai_agents/patch.py +255 -61
- deepeval/openai_agents/runner.py +348 -335
- deepeval/tracing/context.py +1 -0
- deepeval/tracing/tracing.py +3 -0
- deepeval/utils.py +4 -3
- {deepeval-3.5.9.dist-info → deepeval-3.6.0.dist-info}/METADATA +1 -1
- {deepeval-3.5.9.dist-info → deepeval-3.6.0.dist-info}/RECORD +19 -18
- {deepeval-3.5.9.dist-info → deepeval-3.6.0.dist-info}/LICENSE.md +0 -0
- {deepeval-3.5.9.dist-info → deepeval-3.6.0.dist-info}/WHEEL +0 -0
- {deepeval-3.5.9.dist-info → deepeval-3.6.0.dist-info}/entry_points.txt +0 -0
deepeval/_version.py
CHANGED
@@ -1 +1 @@
-__version__: str = "3.5.9"
+__version__: str = "3.6.0"
deepeval/config/settings_manager.py
CHANGED
@@ -15,7 +15,7 @@ from enum import Enum
 from pydantic import SecretStr
 from deepeval.config.settings import get_settings, _SAVE_RE
 from deepeval.cli.dotenv_handler import DotenvHandler
-from deepeval.utils import bool_to_env_str
+from deepeval.config.utils import bool_to_env_str
 
 logger = logging.getLogger(__name__)
 StrOrEnum = Union[str, Enum]
deepeval/contextvars.py
ADDED
@@ -0,0 +1,25 @@
+from __future__ import annotations
+
+from contextvars import ContextVar
+from typing import TYPE_CHECKING, Optional
+
+
+if TYPE_CHECKING:
+    from deepeval.dataset.golden import Golden
+
+
+CURRENT_GOLDEN: ContextVar[Optional[Golden]] = ContextVar(
+    "CURRENT_GOLDEN", default=None
+)
+
+
+def set_current_golden(golden: Optional[Golden]):
+    return CURRENT_GOLDEN.set(golden)
+
+
+def get_current_golden() -> Optional[Golden]:
+    return CURRENT_GOLDEN.get()
+
+
+def reset_current_golden(token) -> None:
+    CURRENT_GOLDEN.reset(token)
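The new module is a thin, token-based wrapper around contextvars.ContextVar. A minimal sketch of the intended set/reset discipline, using the functions added above (the driver loop itself is illustrative, not deepeval's actual evaluation loop):

    from deepeval.contextvars import (
        get_current_golden,
        reset_current_golden,
        set_current_golden,
    )

    def run_goldens(goldens):
        for golden in goldens:
            # set() returns a Token that restores the previous value on
            # reset(), so state cannot leak between iterations even on error.
            token = set_current_golden(golden)
            try:
                assert get_current_golden() is golden  # visible to user code here
            finally:
                reset_current_golden(token)

    run_goldens(["golden-1", "golden-2"])  # any objects work at runtime
    assert get_current_golden() is None    # default restored after the loop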
deepeval/dataset/__init__.py
CHANGED
@@ -1,5 +1,11 @@
+from deepeval.contextvars import get_current_golden
 from .dataset import EvaluationDataset
 from .golden import Golden, ConversationalGolden
-from .test_run_tracer import init_global_test_run_tracer
 
-
+
+__all__ = [
+    "EvaluationDataset",
+    "Golden",
+    "ConversationalGolden",
+    "get_current_golden",
+]
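Because of the re-export above, application code under evaluation can look up the golden currently being run without threading it through function arguments. A hedged sketch (my_llm_app and its body are hypothetical):

    from deepeval.dataset import get_current_golden

    def my_llm_app(user_input: str) -> str:
        golden = get_current_golden()  # None outside an evaluation loop
        if golden is not None:
            print(f"Evaluating against golden input: {golden.input}")
        return "placeholder model output"  # hypothetical app response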
deepeval/evaluate/execute.py
CHANGED
@@ -42,6 +42,7 @@ from deepeval.tracing.api import (
     BaseApiSpan,
 )
 from deepeval.dataset import Golden
+from deepeval.contextvars import set_current_golden, reset_current_golden
 from deepeval.errors import MissingTestCaseParamsError
 from deepeval.metrics.utils import copy_metrics
 from deepeval.utils import (
@@ -1480,6 +1481,7 @@ def execute_agentic_test_cases_from_loop(
     )
 
     for golden in goldens:
+        token = set_current_golden(golden)
         with capture_evaluation_run("golden"):
             # yield golden
             count += 1
@@ -1492,8 +1494,14 @@
                 _progress=progress,
                 _pbar_callback_id=pbar_tags_id,
             ):
-                yield golden
-                current_trace: Trace = current_trace_context.get()
+                try:
+                    # yield golden to user code
+                    yield golden
+                    # control has returned from user code without error, capture trace now
+                    current_trace: Trace = current_trace_context.get()
+                finally:
+                    # after user code returns control, always reset the context
+                    reset_current_golden(token)
 
             update_pbar(progress, pbar_tags_id)
             update_pbar(progress, pbar_id)
@@ -1849,6 +1857,7 @@ def a_execute_agentic_test_cases_from_loop(
 
     try:
         for index, golden in enumerate(goldens):
+            token = set_current_golden(golden)
             current_golden_ctx.update(
                 {
                     "index": index,
@@ -1857,7 +1866,10 @@
                 }
             )
             prev_task_length = len(created_tasks)
-            yield golden
+            try:
+                yield golden
+            finally:
+                reset_current_golden(token)
             # if this golden created no tasks, bump bars now
             if len(created_tasks) == prev_task_length:
                 update_pbar(progress, pbar_callback_id)
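The try/finally around each yield matters because the generator suspends there while user code runs; that code may raise, break out of the loop, or never resume the generator. The finally clause still executes when the generator is resumed, closed, or garbage collected, so the context variable is always restored. A self-contained sketch of the same pattern:

    from contextvars import ContextVar

    CURRENT: ContextVar[object] = ContextVar("CURRENT", default=None)

    def loop(items):
        for item in items:
            token = CURRENT.set(item)
            try:
                yield item  # control passes to consumer code here
            finally:
                CURRENT.reset(token)  # runs on resume, close(), or thrown error

    gen = loop(["a", "b"])
    next(gen)                    # suspended at yield; "a" is current
    assert CURRENT.get() == "a"
    gen.close()                  # GeneratorExit triggers the finally clause
    assert CURRENT.get() is None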
deepeval/openai_agents/__init__.py
CHANGED
@@ -1,6 +1,7 @@
 from deepeval.openai_agents.callback_handler import DeepEvalTracingProcessor
-from deepeval.openai_agents.runner import Runner
-from deepeval.openai_agents.patch import function_tool
 from deepeval.openai_agents.agent import DeepEvalAgent as Agent
+from deepeval.openai_agents.patch import function_tool
+
+# from deepeval.openai_agents.runner import Runner
 
-__all__ = ["DeepEvalTracingProcessor", "Runner", "Agent", "function_tool"]
+__all__ = ["DeepEvalTracingProcessor", "Agent", "function_tool"]
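Net effect on the public surface: the custom Runner is gone from the exports, so callers are expected to use the stock openai-agents Runner alongside the patched Agent alias. A sketch of the resulting imports (assuming the standard openai-agents package layout):

    from agents import Runner  # stock runner from openai-agents

    from deepeval.openai_agents import (
        Agent,  # alias for DeepEvalAgent
        DeepEvalTracingProcessor,
        function_tool,
    )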
deepeval/openai_agents/agent.py
CHANGED
@@ -1,20 +1,17 @@
 from __future__ import annotations
 
-from dataclasses import dataclass
-from typing import
+from dataclasses import dataclass
+from typing import Generic, TypeVar, List
 
-from deepeval.tracing import observe
 from deepeval.prompt import Prompt
-from deepeval.tracing.tracing import Observer
 from deepeval.metrics import BaseMetric
-from deepeval.tracing.utils import make_json_serializable
 from deepeval.tracing.types import LlmSpan
-from deepeval.tracing.context import current_span_context
 
 try:
     from agents.agent import Agent as BaseAgent
-    from
-
+    from deepeval.openai_agents.patch import (
+        patch_default_agent_runner_get_model,
+    )
 except Exception as e:
     raise RuntimeError(
         "openai-agents is required for this integration. Please install it."
@@ -23,163 +20,6 @@ except Exception as e:
 TContext = TypeVar("TContext")
 
 
-class _ObservedModel(Model):
-    def __init__(
-        self,
-        inner: Model,
-        llm_metric_collection: str = None,
-        llm_metrics: List[BaseMetric] = None,
-        confident_prompt: Prompt = None,
-    ) -> None:
-        self._inner = inner
-        self._llm_metric_collection = llm_metric_collection
-        self._llm_metrics = llm_metrics
-        self._confident_prompt = confident_prompt
-
-    def __getattr__(self, name: str) -> Any:
-        return getattr(self._inner, name)
-
-    def _get_model_name(self) -> str:
-        try:
-            for attr in ("model", "model_name", "name"):
-                if hasattr(self._inner, attr):
-                    val = getattr(self._inner, attr)
-                    if val is not None:
-                        return str(val)
-        except Exception:
-            pass
-        return "unknown"
-
-    async def get_response(
-        self,
-        system_instructions,
-        input,
-        model_settings,
-        tools,
-        output_schema,
-        handoffs,
-        tracing,
-        *,
-        previous_response_id,
-        conversation_id,
-        prompt,
-        **kwargs,
-    ):
-        model_name = self._get_model_name()
-        with Observer(
-            span_type="llm",
-            func_name="LLM",
-            function_kwargs={
-                "system_instructions": system_instructions,
-                "input": input,
-                "model_settings": model_settings,
-                "tools": tools,
-                "output_schema": output_schema,
-                "handoffs": handoffs,
-                # "tracing": tracing,  # not important for llm spans
-                # "previous_response_id": previous_response_id,  # not important for llm spans
-                # "conversation_id": conversation_id,  # not important for llm spans
-                "prompt": prompt,
-                **kwargs,
-            },
-            observe_kwargs={"model": model_name},
-            metrics=self._llm_metrics,
-            metric_collection=self._llm_metric_collection,
-        ) as observer:
-            result = await self._inner.get_response(
-                system_instructions,
-                input,
-                model_settings,
-                tools,
-                output_schema,
-                handoffs,
-                tracing,
-                previous_response_id=previous_response_id,
-                conversation_id=conversation_id,
-                prompt=prompt,
-                **kwargs,
-            )
-            llm_span: LlmSpan = current_span_context.get()
-            llm_span.prompt = self._confident_prompt
-
-            observer.result = make_json_serializable(result.output)
-
-            return result
-
-    def stream_response(
-        self,
-        system_instructions,
-        input,
-        model_settings,
-        tools,
-        output_schema,
-        handoffs,
-        tracing,
-        *,
-        previous_response_id,
-        conversation_id,
-        prompt,
-        **kwargs,
-    ):
-        model_name = self._get_model_name()
-
-        async def _gen():
-            observer = Observer(
-                span_type="llm",
-                func_name="LLM",
-                function_kwargs={
-                    "system_instructions": system_instructions,
-                    "input": input,
-                    "model_settings": model_settings,
-                    "tools": tools,
-                    "output_schema": output_schema,
-                    "handoffs": handoffs,
-                    # "tracing": tracing,
-                    # "previous_response_id": previous_response_id,
-                    # "conversation_id": conversation_id,
-                    "prompt": prompt,
-                    **kwargs,
-                },
-                observe_kwargs={"model": model_name},
-                metrics=self._llm_metrics,
-                metric_collection=self._llm_metric_collection,
-            )
-            observer.__enter__()
-
-            llm_span: LlmSpan = current_span_context.get()
-            llm_span.prompt = self._confident_prompt
-
-            try:
-                async for event in self._inner.stream_response(
-                    system_instructions,
-                    input,
-                    model_settings,
-                    tools,
-                    output_schema,
-                    handoffs,
-                    tracing,
-                    previous_response_id=previous_response_id,
-                    conversation_id=conversation_id,
-                    prompt=prompt,
-                ):
-
-                    if isinstance(event, ResponseCompletedEvent):
-                        observer.result = make_json_serializable(
-                            event.response.output
-                        )
-
-                    yield event
-
-            except Exception as e:
-                observer.__exit__(type(e), e, e.__traceback__)
-                raise
-            finally:
-
-                observer.__exit__(None, None, None)
-
-        return _gen()
-
-
 @dataclass
 class DeepEvalAgent(BaseAgent[TContext], Generic[TContext]):
     """
@@ -189,6 +29,8 @@ class DeepEvalAgent(BaseAgent[TContext], Generic[TContext]):
     llm_metric_collection: str = None
     llm_metrics: List[BaseMetric] = None
     confident_prompt: Prompt = None
+    agent_metrics: List[BaseMetric] = None
+    agent_metric_collection: str = None
 
     def __post_init__(self):
-
+        patch_default_agent_runner_get_model()
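With the wrapper model gone, instrumentation now rides on the runner patch applied in __post_init__, and configuration stays declarative on the dataclass. A hedged construction sketch showing the two new fields (AnswerRelevancyMetric is just an example metric; name and instructions are standard openai-agents fields):

    from deepeval.metrics import AnswerRelevancyMetric
    from deepeval.openai_agents import Agent

    agent = Agent(
        name="Assistant",
        instructions="Answer as briefly as possible.",
        llm_metrics=[AnswerRelevancyMetric()],          # existing 3.5.x field
        agent_metrics=[AnswerRelevancyMetric()],        # new in 3.6.0
        agent_metric_collection="my-agent-collection",  # new in 3.6.0
    )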
deepeval/openai_agents/callback_handler.py
CHANGED
@@ -21,6 +21,10 @@ try:
         ResponseSpanData,
         SpanData,
     )
+    from deepeval.openai_agents.patch import (
+        patch_default_agent_run_single_turn,
+        patch_default_agent_run_single_turn_streamed,
+    )
 
     openai_agents_available = True
 except ImportError:
@@ -37,6 +41,8 @@ def _check_openai_agents_available():
 class DeepEvalTracingProcessor(TracingProcessor):
     def __init__(self) -> None:
         _check_openai_agents_available()
+        patch_default_agent_run_single_turn()
+        patch_default_agent_run_single_turn_streamed()
         self.span_observers: dict[str, Observer] = {}
 
     def on_trace_start(self, trace: "Trace") -> None:
@@ -46,66 +52,62 @@ class DeepEvalTracingProcessor(TracingProcessor):
         _trace_name = trace_dict.get("workflow_name")
         _trace_metadata = trace_dict.get("metadata")
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                children=[],
-            )
+        _trace = trace_manager.start_new_trace(trace_uuid=str(_trace_uuid))
+        _trace.thread_id = str(_thread_id)
+        _trace.name = str(_trace_name)
+        _trace.metadata = make_json_serializable(_trace_metadata)
+        current_trace_context.set(_trace)
+
+        trace_manager.add_span(  # adds a dummy root span
+            BaseSpan(
+                uuid=_trace_uuid,
+                trace_uuid=_trace_uuid,
+                parent_uuid=None,
+                start_time=perf_counter(),
+                name=_trace_name,
+                status=TraceSpanStatus.IN_PROGRESS,
+                children=[],
             )
-
-        current_trace = current_trace_context.get()
-        if current_trace:
-            current_trace.name = str(_trace_name)
+        )
 
     def on_trace_end(self, trace: "Trace") -> None:
         trace_dict = trace.export()
         _trace_uuid = trace_dict.get("id")
-        _thread_id = trace_dict.get("group_id")
         _trace_name = trace_dict.get("workflow_name")
-        _trace_metadata = trace_dict.get("metadata")
 
-
-
-
-        )  # removing the dummy root span
-        trace_manager.end_trace(_trace_uuid)
-        current_trace_context.set(None)
+        trace_manager.remove_span(_trace_uuid)  # removing the dummy root span
+        trace_manager.end_trace(_trace_uuid)
+        current_trace_context.set(None)
 
     def on_span_start(self, span: "Span") -> None:
         if not span.started_at:
             return
+        current_span = current_span_context.get()
+        if current_span and isinstance(current_span, LlmSpan):
+            return
+
         span_type = self.get_span_kind(span.span_data)
-
-
-        observer.
-
-
-
-
-
-        observer.__enter__()
+        observer = Observer(span_type=span_type, func_name="NA")
+        if span_type == "llm":
+            observer.observe_kwargs["model"] = "temporary model"
+        observer.update_span_properties = (
+            lambda span_type: update_span_properties(span_type, span.span_data)
+        )
+        self.span_observers[span.span_id] = observer
+        observer.__enter__()
 
     def on_span_end(self, span: "Span") -> None:
-
-
-
-
-
-
-
-
+        update_trace_properties_from_span_data(
+            current_trace_context.get(), span.span_data
+        )
+
+        current_span = current_span_context.get()
+        if current_span and isinstance(current_span, LlmSpan):
+            update_span_properties(current_span, span.span_data)
+            return
+        observer = self.span_observers.pop(span.span_id, None)
+        if observer:
+            observer.__exit__(None, None, None)
 
     def force_flush(self) -> None:
         pass
@@ -116,19 +118,18 @@ class DeepEvalTracingProcessor(TracingProcessor):
     def get_span_kind(self, span_data: "SpanData") -> str:
         if isinstance(span_data, AgentSpanData):
            return "agent"
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        return None
+        if isinstance(span_data, FunctionSpanData):
+            return "tool"
+        if isinstance(span_data, MCPListToolsSpanData):
+            return "tool"
+        if isinstance(span_data, GenerationSpanData):
+            return "llm"
+        if isinstance(span_data, ResponseSpanData):
+            return "llm"
+        if isinstance(span_data, HandoffSpanData):
+            return "custom"
+        if isinstance(span_data, CustomSpanData):
+            return "base"
+        if isinstance(span_data, GuardrailSpanData):
+            return "base"
+        return "base"
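Registering the processor remains the single integration step; per the __init__ change above, construction now also applies the single-turn patches. A minimal wiring sketch (add_trace_processor and Runner.run_sync are standard openai-agents APIs):

    from agents import Runner, add_trace_processor

    from deepeval.openai_agents import Agent, DeepEvalTracingProcessor

    add_trace_processor(DeepEvalTracingProcessor())  # also applies the patches

    agent = Agent(name="Assistant", instructions="Answer briefly.")
    result = Runner.run_sync(agent, "What is 2 + 2?")
    print(result.final_output)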
deepeval/openai_agents/extractors.py
CHANGED
@@ -1,9 +1,10 @@
+from deepeval.tracing.types import Trace
 from openai.types.responses.response_input_item_param import (
     FunctionCallOutput,
     Message,
 )
 from openai.types.responses.response_output_message_param import Content
-from typing import Union, List
+from typing import Union, List, Optional
 from openai.types.responses import (
     ResponseFunctionToolCallParam,
     ResponseOutputMessageParam,
@@ -25,6 +26,8 @@ from deepeval.tracing.types import (
 )
 import json
 
+from deepeval.tracing.utils import make_json_serializable
+
 try:
     from agents import MCPListToolsSpanData
     from agents.tracing.span_data import (
@@ -89,13 +92,17 @@ def update_span_properties_from_response_span_data(
         return
     # Extract usage tokens
     usage = response.usage
+    cached_input_tokens = None
+    ouptut_reasoning_tokens = None
     if usage:
         output_tokens = usage.output_tokens
         input_tokens = usage.input_tokens
         cached_input_tokens = usage.input_tokens_details.cached_tokens
         ouptut_reasoning_tokens = usage.output_tokens_details.reasoning_tokens
     # Get input and output
-    input = parse_response_input(
+    input = parse_response_input(
+        span_data.input, span_data.response.instructions
+    )
     raw_output = parse_response_output(response.output)
     output = (
         raw_output if isinstance(raw_output, str) else json.dumps(raw_output)
@@ -112,6 +119,23 @@ def update_span_properties_from_response_span_data(
     span.input = input
     span.output = output
     span.name = "LLM Generation"
+    response_dict = response.model_dump(exclude_none=True, mode="json")
+    span.metadata["invocation_params"] = {
+        k: v
+        for k, v in response_dict.items()
+        if k
+        in (
+            "max_output_tokens",
+            "parallel_tool_calls",
+            "reasoning",
+            "temperature",
+            "text",
+            "tool_choice",
+            "tools",
+            "top_p",
+            "truncation",
+        )
+    }
 
 
 def update_span_properties_from_generation_span_data(
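The comprehension above whitelists generation parameters out of the full Responses payload before storing them on the span. A standalone sketch of the same filter over a fabricated response dict:

    # Fabricated payload; only whitelisted keys survive the filter.
    response_dict = {"id": "resp_123", "temperature": 0.2, "top_p": 1.0, "usage": {}}
    ALLOWED = (
        "max_output_tokens", "parallel_tool_calls", "reasoning", "temperature",
        "text", "tool_choice", "tools", "top_p", "truncation",
    )
    invocation_params = {k: v for k, v in response_dict.items() if k in ALLOWED}
    print(invocation_params)  # {'temperature': 0.2, 'top_p': 1.0}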
@@ -136,6 +160,11 @@ def update_span_properties_from_generation_span_data(
     span.input = input
     span.output = output
     span.name = "LLM Generation"
+    span.metadata["invocation_params"] = {
+        "model_config": make_json_serializable(
+            generation_span_data.model_config
+        ),
+    }
 
 
 ########################################################
@@ -191,8 +220,6 @@
     if agent_span_data.output_type:
         metadata["output_type"] = agent_span_data.output_type
     span.metadata = metadata
-    span.input = None
-    span.output = None
 
 
 ########################################################
@@ -238,10 +265,30 @@ def update_span_properties_from_guardrail_span_data(
 ########################################################
 
 
-def parse_response_input(
-
-
+def parse_response_input(
+    input: Union[str, List[ResponseInputItemParam]],
+    instructions: Optional[Union[str, List[ResponseInputItemParam]]] = None,
+):
+
     processed_input = []
+
+    if isinstance(input, str) and isinstance(instructions, str):
+        return [
+            {"type": "message", "role": "system", "content": instructions},
+            {"type": "message", "role": "user", "content": input},
+        ]
+    elif isinstance(input, list) and isinstance(instructions, list):
+        input = instructions + input
+    elif isinstance(input, list) and isinstance(instructions, str):
+        processed_input += [
+            {"type": "message", "role": "system", "content": instructions}
+        ]
+    elif isinstance(input, str) and isinstance(instructions, list):
+        processed_input += [
+            {"type": "message", "role": "user", "content": input}
+        ]
+        input = instructions
+
     for item in input:
         if "type" not in item:
             if "role" in item and "content" in item:
@@ -365,3 +412,32 @@ def parse_function_call(
         "name": function_call.name,
         "arguments": function_call.arguments,
     }
+
+
+def update_trace_properties_from_span_data(
+    trace: Trace,
+    span_data: Union["ResponseSpanData", "GenerationSpanData"],
+):
+    if isinstance(span_data, ResponseSpanData):
+        if not trace.input:
+            trace.input = parse_response_input(
+                span_data.input, span_data.response.instructions
+            )
+        raw_output = parse_response_output(span_data.response.output)
+        output = (
+            raw_output
+            if isinstance(raw_output, str)
+            else json.dumps(raw_output)
+        )
+        trace.output = output
+
+    elif isinstance(span_data, GenerationSpanData):
+        if not trace.input:
+            trace.input = span_data.input
+        raw_output = span_data.output
+        output = (
+            raw_output
+            if isinstance(raw_output, str)
+            else json.dumps(raw_output)
+        )
+        trace.output = output