deepeval 3.5.3__py3-none-any.whl → 3.5.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/config/settings.py +14 -0
- deepeval/constants.py +2 -1
- deepeval/dataset/dataset.py +11 -4
- deepeval/dataset/types.py +19 -11
- deepeval/dataset/utils.py +31 -3
- deepeval/evaluate/execute.py +216 -17
- deepeval/integrations/pydantic_ai/__init__.py +3 -1
- deepeval/integrations/pydantic_ai/agent.py +339 -0
- deepeval/integrations/pydantic_ai/patcher.py +479 -406
- deepeval/integrations/pydantic_ai/utils.py +239 -2
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +2 -1
- deepeval/metrics/non_advice/non_advice.py +2 -2
- deepeval/metrics/pii_leakage/pii_leakage.py +1 -1
- deepeval/openai_agents/agent.py +115 -106
- deepeval/openai_agents/callback_handler.py +21 -30
- deepeval/openai_agents/runner.py +288 -71
- deepeval/tracing/tracing.py +1 -3
- {deepeval-3.5.3.dist-info → deepeval-3.5.5.dist-info}/METADATA +3 -1
- {deepeval-3.5.3.dist-info → deepeval-3.5.5.dist-info}/RECORD +23 -22
- {deepeval-3.5.3.dist-info → deepeval-3.5.5.dist-info}/LICENSE.md +0 -0
- {deepeval-3.5.3.dist-info → deepeval-3.5.5.dist-info}/WHEEL +0 -0
- {deepeval-3.5.3.dist-info → deepeval-3.5.5.dist-info}/entry_points.txt +0 -0
deepeval/integrations/pydantic_ai/utils.py
CHANGED

@@ -1,8 +1,29 @@
-from
-from
+from time import perf_counter
+from contextlib import asynccontextmanager
+import inspect
+import functools
+from typing import Any, Callable, List, Optional
+
+from pydantic_ai.models import Model
 from pydantic_ai.agent import AgentRunResult
 from pydantic_ai._run_context import RunContext
+from pydantic_ai.messages import (
+    ModelRequest,
+    ModelResponse,
+    ModelResponsePart,
+    SystemPromptPart,
+    TextPart,
+    ToolCallPart,
+    ToolReturnPart,
+    UserPromptPart,
+)
+
+from deepeval.prompt import Prompt
+from deepeval.tracing.tracing import Observer
+from deepeval.metrics.base_metric import BaseMetric
 from deepeval.test_case.llm_test_case import ToolCall
+from deepeval.tracing.context import current_trace_context, current_span_context
+from deepeval.tracing.types import AgentSpan, LlmOutput, LlmSpan, LlmToolCall
 
 
 # llm tools called
@@ -84,3 +105,219 @@ def sanitize_run_context(value):
         return {sanitize_run_context(v) for v in value}
 
     return value
+
+
+def patch_llm_model(
+    model: Model,
+    llm_metric_collection: Optional[str] = None,
+    llm_metrics: Optional[List[BaseMetric]] = None,
+    llm_prompt: Optional[Prompt] = None,
+):
+    original_func = model.request
+    sig = inspect.signature(original_func)
+
+    try:
+        model_name = model.model_name
+    except Exception:
+        model_name = "unknown"
+
+    @functools.wraps(original_func)
+    async def wrapper(*args, **kwargs):
+        bound = sig.bind_partial(*args, **kwargs)
+        bound.apply_defaults()
+        request = bound.arguments.get("messages", [])
+
+        with Observer(
+            span_type="llm",
+            func_name="LLM",
+            observe_kwargs={"model": model_name},
+            metrics=llm_metrics,
+            metric_collection=llm_metric_collection,
+        ) as observer:
+            result = await original_func(*args, **kwargs)
+            observer.update_span_properties = (
+                lambda llm_span: set_llm_span_attributes(
+                    llm_span, request, result, llm_prompt
+                )
+            )
+            observer.result = result
+        return result
+
+    model.request = wrapper
+
+    stream_original_func = model.request_stream
+    stream_sig = inspect.signature(stream_original_func)
+
+    @asynccontextmanager
+    async def stream_wrapper(*args, **kwargs):
+        bound = stream_sig.bind_partial(*args, **kwargs)
+        bound.apply_defaults()
+        request = bound.arguments.get("messages", [])
+
+        with Observer(
+            span_type="llm",
+            func_name="LLM",
+            observe_kwargs={"model": model_name},
+            metrics=llm_metrics,
+            metric_collection=llm_metric_collection,
+        ) as observer:
+            llm_span: LlmSpan = current_span_context.get()
+            async with stream_original_func(
+                *args, **kwargs
+            ) as streamed_response:
+                try:
+                    yield streamed_response
+                    if not llm_span.token_intervals:
+                        llm_span.token_intervals = {perf_counter(): "NA"}
+                    else:
+                        llm_span.token_intervals[perf_counter()] = "NA"
+                finally:
+                    try:
+                        result = streamed_response.get()
+                        observer.update_span_properties = (
+                            lambda llm_span: set_llm_span_attributes(
+                                llm_span, request, result, llm_prompt
+                            )
+                        )
+                        observer.result = result
+                    except Exception:
+                        pass
+
+    model.request_stream = stream_wrapper
+
+
+def create_patched_tool(
+    func: Callable,
+    metrics: Optional[List[BaseMetric]] = None,
+    metric_collection: Optional[str] = None,
+):
+    import asyncio
+
+    original_func = func
+
+    is_async = asyncio.iscoroutinefunction(original_func)
+
+    if is_async:
+
+        @functools.wraps(original_func)
+        async def async_wrapper(*args, **kwargs):
+            sanitized_args = sanitize_run_context(args)
+            sanitized_kwargs = sanitize_run_context(kwargs)
+            with Observer(
+                span_type="tool",
+                func_name=original_func.__name__,
+                metrics=metrics,
+                metric_collection=metric_collection,
+                function_kwargs={"args": sanitized_args, **sanitized_kwargs},
+            ) as observer:
+                result = await original_func(*args, **kwargs)
+                observer.result = result
+
+            return result
+
+        return async_wrapper
+    else:
+
+        @functools.wraps(original_func)
+        def sync_wrapper(*args, **kwargs):
+            sanitized_args = sanitize_run_context(args)
+            sanitized_kwargs = sanitize_run_context(kwargs)
+            with Observer(
+                span_type="tool",
+                func_name=original_func.__name__,
+                metrics=metrics,
+                metric_collection=metric_collection,
+                function_kwargs={"args": sanitized_args, **sanitized_kwargs},
+            ) as observer:
+                result = original_func(*args, **kwargs)
+                observer.result = result
+
+            return result
+
+        return sync_wrapper
+
+
+def update_trace_context(
+    trace_name: Optional[str] = None,
+    trace_tags: Optional[List[str]] = None,
+    trace_metadata: Optional[dict] = None,
+    trace_thread_id: Optional[str] = None,
+    trace_user_id: Optional[str] = None,
+    trace_metric_collection: Optional[str] = None,
+    trace_metrics: Optional[List[BaseMetric]] = None,
+    trace_input: Optional[Any] = None,
+    trace_output: Optional[Any] = None,
+):
+
+    current_trace = current_trace_context.get()
+
+    if trace_name:
+        current_trace.name = trace_name
+    if trace_tags:
+        current_trace.tags = trace_tags
+    if trace_metadata:
+        current_trace.metadata = trace_metadata
+    if trace_thread_id:
+        current_trace.thread_id = trace_thread_id
+    if trace_user_id:
+        current_trace.user_id = trace_user_id
+    if trace_metric_collection:
+        current_trace.metric_collection = trace_metric_collection
+    if trace_metrics:
+        current_trace.metrics = trace_metrics
+    if trace_input:
+        current_trace.input = trace_input
+    if trace_output:
+        current_trace.output = trace_output
+
+
+def set_llm_span_attributes(
+    llm_span: LlmSpan,
+    requests: List[ModelRequest],
+    result: ModelResponse,
+    llm_prompt: Optional[Prompt] = None,
+):
+    llm_span.prompt = llm_prompt
+
+    input = []
+    for request in requests:
+        for part in request.parts:
+            if isinstance(part, SystemPromptPart):
+                input.append({"role": "System", "content": part.content})
+            elif isinstance(part, UserPromptPart):
+                input.append({"role": "User", "content": part.content})
+            elif isinstance(part, ToolCallPart):
+                input.append(
+                    {
+                        "role": "Tool Call",
+                        "name": part.tool_name,
+                        "content": part.args_as_json_str(),
+                    }
+                )
+            elif isinstance(part, ToolReturnPart):
+                input.append(
+                    {
+                        "role": "Tool Return",
+                        "name": part.tool_name,
+                        "content": part.model_response_str(),
+                    }
+                )
+    llm_span.input = input
+
+    content = ""
+    tool_calls = []
+    for part in result.parts:
+        if isinstance(part, TextPart):
+            content += part.content + "\n"
+        elif isinstance(part, ToolCallPart):
+            tool_calls.append(
+                LlmToolCall(name=part.tool_name, args=part.args_as_dict())
+            )
+    llm_span.output = LlmOutput(
+        role="Assistant", content=content, tool_calls=tool_calls
+    )
+    llm_span.tools_called = extract_tools_called_from_llm_response(result.parts)
+
+
+def set_agent_span_attributes(agent_span: AgentSpan, result: AgentRunResult):
+    agent_span.tools_called = extract_tools_called(result)
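The helpers above are what the pydantic-ai integration now uses to turn model requests into `llm` spans and tool calls into `tool` spans. A minimal sketch of applying them directly follows; the `OpenAIModel` construction, the `AnswerRelevancyMetric` choice, and the `get_weather` tool are illustrative assumptions rather than part of this diff (the integration normally wires these up through its patcher and agent wrapper).

```python
# Sketch only: assumes an OpenAI-backed pydantic-ai model and a made-up tool.
from pydantic_ai import Agent
from pydantic_ai.models.openai import OpenAIModel

from deepeval.metrics import AnswerRelevancyMetric
from deepeval.integrations.pydantic_ai.utils import (
    create_patched_tool,
    patch_llm_model,
)

model = OpenAIModel("gpt-4o-mini")

# Wrap model.request / model.request_stream so each LLM call is recorded
# as an "llm" span with these metrics attached.
patch_llm_model(model, llm_metrics=[AnswerRelevancyMetric()])


def get_weather(city: str) -> str:
    # Hypothetical tool used only for illustration.
    return f"Sunny in {city}"


# Wrap the tool so each invocation is recorded as a "tool" span;
# RunContext arguments are sanitized before being attached to the span.
observed_get_weather = create_patched_tool(get_weather, metric_collection="tools")

agent = Agent(model, tools=[observed_get_weather])
```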
deepeval/metrics/mcp_use_metric/mcp_use_metric.py
CHANGED

@@ -283,8 +283,9 @@ class MCPUseMetric(BaseMetric):
         mcp_resources_called: List[MCPResourceCall],
         mcp_prompts_called: List[MCPPromptCall],
     ) -> tuple[str, str]:
+        available_primitives = "MCP Primitives Available: \n"
         for mcp_server in mcp_servers:
-            available_primitives
+            available_primitives += f"MCP Server {mcp_server.server_name}\n"
             available_primitives += (
                 (
                     "\nAvailable Tools:\n[\n"
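The change above initializes the `available_primitives` summary once and then appends a per-server header on every loop iteration. A standalone sketch of that accumulation pattern is below; the server names and tool listing are made up for illustration.

```python
# Standalone sketch of the string accumulation used above; values are made up.
servers = ["weather", "search"]

available_primitives = "MCP Primitives Available: \n"
for server_name in servers:
    available_primitives += f"MCP Server {server_name}\n"
    available_primitives += "\nAvailable Tools:\n[\n  ...\n]\n"

print(available_primitives)
```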
deepeval/metrics/non_advice/non_advice.py
CHANGED

@@ -43,7 +43,7 @@ class NonAdviceMetric(BaseMetric):
                 "or ['financial', 'medical'] for multiple types."
             )
 
-        self.threshold =
+        self.threshold = 1 if strict_mode else threshold
         self.advice_types = advice_types
         self.model, self.using_native_model = initialize_model(model)
         self.evaluation_model = self.model.get_model_name()
@@ -293,7 +293,7 @@ class NonAdviceMetric(BaseMetric):
                 appropriate_advice_count += 1
 
         score = appropriate_advice_count / number_of_verdicts
-        return
+        return 0 if self.strict_mode and score < self.threshold else score
 
     def is_successful(self) -> bool:
         if self.error is not None:
deepeval/metrics/pii_leakage/pii_leakage.py
CHANGED

@@ -35,7 +35,7 @@ class PIILeakageMetric(BaseMetric):
         verbose_mode: bool = False,
         evaluation_template: Type[PIILeakageTemplate] = PIILeakageTemplate,
     ):
-        self.threshold =
+        self.threshold = 1 if strict_mode else threshold
         self.model, self.using_native_model = initialize_model(model)
         self.evaluation_model = self.model.get_model_name()
         self.include_reason = include_reason
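Both metric fixes restore the same strict-mode contract: when `strict_mode` is enabled the threshold is pinned to 1 and any score below it collapses to 0. The helper below is a hypothetical standalone illustration of that rule, not deepeval API.

```python
# Hypothetical helper; mirrors the strict-mode rule restored in both metrics above.
def finalize_score(raw_score: float, threshold: float, strict_mode: bool) -> float:
    effective_threshold = 1 if strict_mode else threshold
    if strict_mode and raw_score < effective_threshold:
        return 0
    return raw_score


assert finalize_score(0.8, threshold=0.5, strict_mode=False) == 0.8
assert finalize_score(0.8, threshold=0.5, strict_mode=True) == 0
assert finalize_score(1.0, threshold=0.5, strict_mode=True) == 1.0
```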
deepeval/openai_agents/agent.py
CHANGED
@@ -1,14 +1,20 @@
 from __future__ import annotations
 
 from dataclasses import dataclass, field, replace
-from typing import Any, Optional, Awaitable, Callable, Generic, TypeVar
+from typing import Any, Optional, Awaitable, Callable, Generic, TypeVar, List
 
 from deepeval.tracing import observe
 from deepeval.prompt import Prompt
+from deepeval.tracing.tracing import Observer
+from deepeval.metrics import BaseMetric
+from deepeval.tracing.utils import make_json_serializable
+from deepeval.tracing.types import LlmSpan
+from deepeval.tracing.context import current_span_context
 
 try:
     from agents.agent import Agent as BaseAgent
     from agents.models.interface import Model, ModelProvider
+    from openai.types.responses import ResponseCompletedEvent
 except Exception as e:
     raise RuntimeError(
         "openai-agents is required for this integration. Please install it."
@@ -21,17 +27,15 @@ class _ObservedModel(Model):
     def __init__(
         self,
         inner: Model,
-
-
-
-        deepeval_prompt: Optional[Any] = None,
+        llm_metric_collection: str = None,
+        llm_metrics: List[BaseMetric] = None,
+        confident_prompt: Prompt = None,
     ) -> None:
         self._inner = inner
-        self.
-        self.
-        self.
+        self._llm_metric_collection = llm_metric_collection
+        self._llm_metrics = llm_metrics
+        self._confident_prompt = confident_prompt
 
-    # Delegate attributes not overridden
     def __getattr__(self, name: str) -> Any:
         return getattr(self._inner, name)
 
@@ -59,29 +63,48 @@ class _ObservedModel(Model):
         previous_response_id,
         conversation_id,
         prompt,
+        **kwargs,
     ):
         model_name = self._get_model_name()
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        with Observer(
+            span_type="llm",
+            func_name="LLM",
+            function_kwargs={
+                "system_instructions": system_instructions,
+                "input": input,
+                "model_settings": model_settings,
+                "tools": tools,
+                "output_schema": output_schema,
+                "handoffs": handoffs,
+                # "tracing": tracing, # not important for llm spans
+                # "previous_response_id": previous_response_id, # not important for llm spans
+                # "conversation_id": conversation_id, # not important for llm spans
+                "prompt": prompt,
+                **kwargs,
+            },
+            observe_kwargs={"model": model_name},
+            metrics=self._llm_metrics,
+            metric_collection=self._llm_metric_collection,
+        ) as observer:
+            result = await self._inner.get_response(
+                system_instructions,
+                input,
+                model_settings,
+                tools,
+                output_schema,
+                handoffs,
+                tracing,
+                previous_response_id=previous_response_id,
+                conversation_id=conversation_id,
+                prompt=prompt,
+                **kwargs,
+            )
+            llm_span: LlmSpan = current_span_context.get()
+            llm_span.prompt = self._confident_prompt
+
+            observer.result = make_json_serializable(result.output)
+
+            return result
 
     def stream_response(
         self,
@@ -96,91 +119,77 @@ class _ObservedModel(Model):
         previous_response_id,
         conversation_id,
         prompt,
+        **kwargs,
     ):
-
-        # wrapped = observe(
-        # metrics=self._metrics,
-        # metric_collection=self._metric_collection,
-        # type="llm",
-        # model=model_name,
-        # )(self._inner.stream_response)
-        # return wrapped(
-        # system_instructions,
-        # input,
-        # model_settings,
-        # tools,
-        # output_schema,
-        # handoffs,
-        # tracing,
-        # previous_response_id=previous_response_id,
-        # conversation_id=conversation_id,
-        # prompt=prompt,
-        # )
-        return self._inner.stream_response(
-            system_instructions,
-            input,
-            model_settings,
-            tools,
-            output_schema,
-            handoffs,
-            tracing,
-            previous_response_id=previous_response_id,
-            conversation_id=conversation_id,
-            prompt=prompt,
-        )
-
-
-class _ObservedProvider(ModelProvider):
-    def __init__(
-        self,
-        base: ModelProvider,
-        *,
-        metrics: Optional[list[Any]] = None,
-        metric_collection: Optional[str] = None,
-        deepeval_prompt: Optional[Any] = None,
-    ) -> None:
-        self._base = base
-        self._metrics = metrics
-        self._metric_collection = metric_collection
-        self._deepeval_prompt = deepeval_prompt
+        model_name = self._get_model_name()
 
-
-
-
-
-
-
-
+        async def _gen():
+            observer = Observer(
+                span_type="llm",
+                func_name="LLM",
+                function_kwargs={
+                    "system_instructions": system_instructions,
+                    "input": input,
+                    "model_settings": model_settings,
+                    "tools": tools,
+                    "output_schema": output_schema,
+                    "handoffs": handoffs,
+                    # "tracing": tracing,
+                    # "previous_response_id": previous_response_id,
+                    # "conversation_id": conversation_id,
+                    "prompt": prompt,
+                    **kwargs,
+                },
+                observe_kwargs={"model": model_name},
+                metrics=self._llm_metrics,
+                metric_collection=self._llm_metric_collection,
+            )
+            observer.__enter__()
+
+            llm_span: LlmSpan = current_span_context.get()
+            llm_span.prompt = self._confident_prompt
+
+            try:
+                async for event in self._inner.stream_response(
+                    system_instructions,
+                    input,
+                    model_settings,
+                    tools,
+                    output_schema,
+                    handoffs,
+                    tracing,
+                    previous_response_id=previous_response_id,
+                    conversation_id=conversation_id,
+                    prompt=prompt,
+                ):
+
+                    if isinstance(event, ResponseCompletedEvent):
+                        observer.result = (
+                            event.response.output_text
+                        ) # TODO: support other response types
+
+                    yield event
+
+                observer.__exit__(None, None, None)
+            except Exception as e:
+                observer.__exit__(type(e), e, e.__traceback__)
+                raise
+            finally:
+
+                observer.__exit__(None, None, None)
+
+        return _gen()
 
 
 @dataclass
 class DeepEvalAgent(BaseAgent[TContext], Generic[TContext]):
     """
-    A subclass of agents.Agent
-    and ensures the underlying model's `get_response` is wrapped with deepeval.observe.
+    A subclass of agents.Agent.
     """
 
-
-
-
+    llm_metric_collection: str = None
+    llm_metrics: List[BaseMetric] = None
+    confident_prompt: Prompt = None
 
     def __post_init__(self):
         super().__post_init__()
-        # If a direct Model instance is set on the agent, wrap it here.
-        if self.model is not None and not isinstance(self.model, str):
-            try:
-                from agents.models.interface import (
-                    Model as _Model,
-                ) # local import for safety
-
-                if isinstance(self.model, _Model):
-                    self.model = _ObservedModel(
-                        self.model,
-                        metrics=self.metrics,
-                        metric_collection=self.metric_collection,
-                        deepeval_prompt=self.deepeval_prompt,
-                    )
-            except Exception:
-                # If we can't import or wrap, silently skip.
-                pass

deepeval/openai_agents/callback_handler.py
CHANGED

@@ -46,17 +46,7 @@ class DeepEvalTracingProcessor(TracingProcessor):
         if not span.started_at:
             return
         span_type = self.get_span_kind(span.span_data)
-        if span_type == "agent":
-            if isinstance(span.span_data, AgentSpanData):
-                current_trace = current_trace_context.get()
-                if current_trace:
-                    current_trace.name = span.span_data.name
-
-        if span_type == "tool":
-            return
-        elif span_type == "llm":
-            return
-        else:
+        if span_type and span_type == "agent":
             observer = Observer(span_type=span_type, func_name="NA")
             observer.update_span_properties = (
                 lambda base_span: update_span_properties(
@@ -68,13 +58,13 @@ class DeepEvalTracingProcessor(TracingProcessor):
 
     def on_span_end(self, span: "Span") -> None:
         span_type = self.get_span_kind(span.span_data)
-        if span_type == "
+        if span_type and span_type == "agent":
             current_span = current_span_context.get()
             if current_span:
                 update_span_properties(current_span, span.span_data)
-
-
-
+        observer = self.span_observers.pop(span.span_id, None)
+        if observer:
+            observer.__exit__(None, None, None)
 
     def force_flush(self) -> None:
         pass
@@ -85,18 +75,19 @@ class DeepEvalTracingProcessor(TracingProcessor):
     def get_span_kind(self, span_data: "SpanData") -> str:
         if isinstance(span_data, AgentSpanData):
             return "agent"
-        if isinstance(span_data, FunctionSpanData):
-
-        if isinstance(span_data, MCPListToolsSpanData):
-
-        if isinstance(span_data, GenerationSpanData):
-
-        if isinstance(span_data, ResponseSpanData):
-
-        if isinstance(span_data, HandoffSpanData):
-
-        if isinstance(span_data, CustomSpanData):
-
-        if isinstance(span_data, GuardrailSpanData):
-
-        return "base"
+        # if isinstance(span_data, FunctionSpanData):
+        # return "tool"
+        # if isinstance(span_data, MCPListToolsSpanData):
+        # return "tool"
+        # if isinstance(span_data, GenerationSpanData):
+        # return "llm"
+        # if isinstance(span_data, ResponseSpanData):
+        # return "llm"
+        # if isinstance(span_data, HandoffSpanData):
+        # return "custom"
+        # if isinstance(span_data, CustomSpanData):
+        # return "base"
+        # if isinstance(span_data, GuardrailSpanData):
+        # return "base"
+        # return "base"
+        return None
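With these changes, `DeepEvalAgent` carries the `llm_metric_collection`, `llm_metrics`, and `confident_prompt` fields that `_ObservedModel` reads, and the tracing processor only opens spans for agent span data. A hedged construction sketch follows; the agent name, instructions, metric choice, and the `Runner.run_sync` call are assumptions for illustration, and this release also reworks `deepeval/openai_agents/runner.py`, which may be the intended entry point instead.

```python
# Sketch only: field names come from the diff above; everything else is assumed.
from agents import Runner

from deepeval.metrics import AnswerRelevancyMetric
from deepeval.openai_agents.agent import DeepEvalAgent

agent = DeepEvalAgent(
    name="Assistant",
    instructions="Answer briefly.",
    llm_metrics=[AnswerRelevancyMetric()],
    llm_metric_collection="llm-calls",
)

result = Runner.run_sync(agent, "What is DeepEval?")
print(result.final_output)
```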