deepeval 3.5.8__py3-none-any.whl → 3.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,7 @@
1
1
  from deepeval.openai_agents.callback_handler import DeepEvalTracingProcessor
2
- from deepeval.openai_agents.runner import Runner
3
- from deepeval.openai_agents.patch import function_tool
4
2
  from deepeval.openai_agents.agent import DeepEvalAgent as Agent
3
+ from deepeval.openai_agents.patch import function_tool
4
+
5
+ # from deepeval.openai_agents.runner import Runner
5
6
 
6
- __all__ = ["DeepEvalTracingProcessor", "Runner", "function_tool", "Agent"]
7
+ __all__ = ["DeepEvalTracingProcessor", "Agent", "function_tool"]
@@ -1,20 +1,17 @@
1
1
  from __future__ import annotations
2
2
 
3
- from dataclasses import dataclass, field, replace
4
- from typing import Any, Optional, Awaitable, Callable, Generic, TypeVar, List
3
+ from dataclasses import dataclass
4
+ from typing import Generic, TypeVar, List
5
5
 
6
- from deepeval.tracing import observe
7
6
  from deepeval.prompt import Prompt
8
- from deepeval.tracing.tracing import Observer
9
7
  from deepeval.metrics import BaseMetric
10
- from deepeval.tracing.utils import make_json_serializable
11
8
  from deepeval.tracing.types import LlmSpan
12
- from deepeval.tracing.context import current_span_context
13
9
 
14
10
  try:
15
11
  from agents.agent import Agent as BaseAgent
16
- from agents.models.interface import Model, ModelProvider
17
- from openai.types.responses import ResponseCompletedEvent
12
+ from deepeval.openai_agents.patch import (
13
+ patch_default_agent_runner_get_model,
14
+ )
18
15
  except Exception as e:
19
16
  raise RuntimeError(
20
17
  "openai-agents is required for this integration. Please install it."
@@ -23,163 +20,6 @@ except Exception as e:
23
20
  TContext = TypeVar("TContext")
24
21
 
25
22
 
26
- class _ObservedModel(Model):
27
- def __init__(
28
- self,
29
- inner: Model,
30
- llm_metric_collection: str = None,
31
- llm_metrics: List[BaseMetric] = None,
32
- confident_prompt: Prompt = None,
33
- ) -> None:
34
- self._inner = inner
35
- self._llm_metric_collection = llm_metric_collection
36
- self._llm_metrics = llm_metrics
37
- self._confident_prompt = confident_prompt
38
-
39
- def __getattr__(self, name: str) -> Any:
40
- return getattr(self._inner, name)
41
-
42
- def _get_model_name(self) -> str:
43
- try:
44
- for attr in ("model", "model_name", "name"):
45
- if hasattr(self._inner, attr):
46
- val = getattr(self._inner, attr)
47
- if val is not None:
48
- return str(val)
49
- except Exception:
50
- pass
51
- return "unknown"
52
-
53
- async def get_response(
54
- self,
55
- system_instructions,
56
- input,
57
- model_settings,
58
- tools,
59
- output_schema,
60
- handoffs,
61
- tracing,
62
- *,
63
- previous_response_id,
64
- conversation_id,
65
- prompt,
66
- **kwargs,
67
- ):
68
- model_name = self._get_model_name()
69
- with Observer(
70
- span_type="llm",
71
- func_name="LLM",
72
- function_kwargs={
73
- "system_instructions": system_instructions,
74
- "input": input,
75
- "model_settings": model_settings,
76
- "tools": tools,
77
- "output_schema": output_schema,
78
- "handoffs": handoffs,
79
- # "tracing": tracing, # not important for llm spans
80
- # "previous_response_id": previous_response_id, # not important for llm spans
81
- # "conversation_id": conversation_id, # not important for llm spans
82
- "prompt": prompt,
83
- **kwargs,
84
- },
85
- observe_kwargs={"model": model_name},
86
- metrics=self._llm_metrics,
87
- metric_collection=self._llm_metric_collection,
88
- ) as observer:
89
- result = await self._inner.get_response(
90
- system_instructions,
91
- input,
92
- model_settings,
93
- tools,
94
- output_schema,
95
- handoffs,
96
- tracing,
97
- previous_response_id=previous_response_id,
98
- conversation_id=conversation_id,
99
- prompt=prompt,
100
- **kwargs,
101
- )
102
- llm_span: LlmSpan = current_span_context.get()
103
- llm_span.prompt = self._confident_prompt
104
-
105
- observer.result = make_json_serializable(result.output)
106
-
107
- return result
108
-
109
- def stream_response(
110
- self,
111
- system_instructions,
112
- input,
113
- model_settings,
114
- tools,
115
- output_schema,
116
- handoffs,
117
- tracing,
118
- *,
119
- previous_response_id,
120
- conversation_id,
121
- prompt,
122
- **kwargs,
123
- ):
124
- model_name = self._get_model_name()
125
-
126
- async def _gen():
127
- observer = Observer(
128
- span_type="llm",
129
- func_name="LLM",
130
- function_kwargs={
131
- "system_instructions": system_instructions,
132
- "input": input,
133
- "model_settings": model_settings,
134
- "tools": tools,
135
- "output_schema": output_schema,
136
- "handoffs": handoffs,
137
- # "tracing": tracing,
138
- # "previous_response_id": previous_response_id,
139
- # "conversation_id": conversation_id,
140
- "prompt": prompt,
141
- **kwargs,
142
- },
143
- observe_kwargs={"model": model_name},
144
- metrics=self._llm_metrics,
145
- metric_collection=self._llm_metric_collection,
146
- )
147
- observer.__enter__()
148
-
149
- llm_span: LlmSpan = current_span_context.get()
150
- llm_span.prompt = self._confident_prompt
151
-
152
- try:
153
- async for event in self._inner.stream_response(
154
- system_instructions,
155
- input,
156
- model_settings,
157
- tools,
158
- output_schema,
159
- handoffs,
160
- tracing,
161
- previous_response_id=previous_response_id,
162
- conversation_id=conversation_id,
163
- prompt=prompt,
164
- ):
165
-
166
- if isinstance(event, ResponseCompletedEvent):
167
- observer.result = make_json_serializable(
168
- event.response.output
169
- )
170
-
171
- yield event
172
-
173
- except Exception as e:
174
- observer.__exit__(type(e), e, e.__traceback__)
175
- raise
176
- finally:
177
-
178
- observer.__exit__(None, None, None)
179
-
180
- return _gen()
181
-
182
-
183
23
  @dataclass
184
24
  class DeepEvalAgent(BaseAgent[TContext], Generic[TContext]):
185
25
  """
@@ -189,6 +29,8 @@ class DeepEvalAgent(BaseAgent[TContext], Generic[TContext]):
189
29
  llm_metric_collection: str = None
190
30
  llm_metrics: List[BaseMetric] = None
191
31
  confident_prompt: Prompt = None
32
+ agent_metrics: List[BaseMetric] = None
33
+ agent_metric_collection: str = None
192
34
 
193
35
  def __post_init__(self):
194
- super().__post_init__()
36
+ patch_default_agent_runner_get_model()
@@ -21,6 +21,10 @@ try:
21
21
  ResponseSpanData,
22
22
  SpanData,
23
23
  )
24
+ from deepeval.openai_agents.patch import (
25
+ patch_default_agent_run_single_turn,
26
+ patch_default_agent_run_single_turn_streamed,
27
+ )
24
28
 
25
29
  openai_agents_available = True
26
30
  except ImportError:
@@ -37,6 +41,8 @@ def _check_openai_agents_available():
37
41
  class DeepEvalTracingProcessor(TracingProcessor):
38
42
  def __init__(self) -> None:
39
43
  _check_openai_agents_available()
44
+ patch_default_agent_run_single_turn()
45
+ patch_default_agent_run_single_turn_streamed()
40
46
  self.span_observers: dict[str, Observer] = {}
41
47
 
42
48
  def on_trace_start(self, trace: "Trace") -> None:
@@ -46,66 +52,62 @@ class DeepEvalTracingProcessor(TracingProcessor):
46
52
  _trace_name = trace_dict.get("workflow_name")
47
53
  _trace_metadata = trace_dict.get("metadata")
48
54
 
49
- if _thread_id or _trace_metadata:
50
- _trace = trace_manager.start_new_trace(trace_uuid=str(_trace_uuid))
51
- _trace.thread_id = str(_thread_id)
52
- _trace.name = str(_trace_name)
53
- _trace.metadata = make_json_serializable(_trace_metadata)
54
- current_trace_context.set(_trace)
55
-
56
- trace_manager.add_span( # adds a dummy root span
57
- BaseSpan(
58
- uuid=_trace_uuid,
59
- trace_uuid=_trace_uuid,
60
- parent_uuid=None,
61
- start_time=perf_counter(),
62
- name=_trace_name,
63
- status=TraceSpanStatus.IN_PROGRESS,
64
- children=[],
65
- )
55
+ _trace = trace_manager.start_new_trace(trace_uuid=str(_trace_uuid))
56
+ _trace.thread_id = str(_thread_id)
57
+ _trace.name = str(_trace_name)
58
+ _trace.metadata = make_json_serializable(_trace_metadata)
59
+ current_trace_context.set(_trace)
60
+
61
+ trace_manager.add_span( # adds a dummy root span
62
+ BaseSpan(
63
+ uuid=_trace_uuid,
64
+ trace_uuid=_trace_uuid,
65
+ parent_uuid=None,
66
+ start_time=perf_counter(),
67
+ name=_trace_name,
68
+ status=TraceSpanStatus.IN_PROGRESS,
69
+ children=[],
66
70
  )
67
- else:
68
- current_trace = current_trace_context.get()
69
- if current_trace:
70
- current_trace.name = str(_trace_name)
71
+ )
71
72
 
72
73
  def on_trace_end(self, trace: "Trace") -> None:
73
74
  trace_dict = trace.export()
74
75
  _trace_uuid = trace_dict.get("id")
75
- _thread_id = trace_dict.get("group_id")
76
76
  _trace_name = trace_dict.get("workflow_name")
77
- _trace_metadata = trace_dict.get("metadata")
78
77
 
79
- if _thread_id or _trace_metadata:
80
- trace_manager.remove_span(
81
- _trace_uuid
82
- ) # removing the dummy root span
83
- trace_manager.end_trace(_trace_uuid)
84
- current_trace_context.set(None)
78
+ trace_manager.remove_span(_trace_uuid) # removing the dummy root span
79
+ trace_manager.end_trace(_trace_uuid)
80
+ current_trace_context.set(None)
85
81
 
86
82
  def on_span_start(self, span: "Span") -> None:
87
83
  if not span.started_at:
88
84
  return
85
+ current_span = current_span_context.get()
86
+ if current_span and isinstance(current_span, LlmSpan):
87
+ return
88
+
89
89
  span_type = self.get_span_kind(span.span_data)
90
- if span_type and span_type == "agent":
91
- observer = Observer(span_type=span_type, func_name="NA")
92
- observer.update_span_properties = (
93
- lambda base_span: update_span_properties(
94
- base_span, span.span_data
95
- )
96
- )
97
- self.span_observers[span.span_id] = observer
98
- observer.__enter__()
90
+ observer = Observer(span_type=span_type, func_name="NA")
91
+ if span_type == "llm":
92
+ observer.observe_kwargs["model"] = "temporary model"
93
+ observer.update_span_properties = (
94
+ lambda span_type: update_span_properties(span_type, span.span_data)
95
+ )
96
+ self.span_observers[span.span_id] = observer
97
+ observer.__enter__()
99
98
 
100
99
  def on_span_end(self, span: "Span") -> None:
101
- span_type = self.get_span_kind(span.span_data)
102
- if span_type and span_type == "agent":
103
- current_span = current_span_context.get()
104
- if current_span:
105
- update_span_properties(current_span, span.span_data)
106
- observer = self.span_observers.pop(span.span_id, None)
107
- if observer:
108
- observer.__exit__(None, None, None)
100
+ update_trace_properties_from_span_data(
101
+ current_trace_context.get(), span.span_data
102
+ )
103
+
104
+ current_span = current_span_context.get()
105
+ if current_span and isinstance(current_span, LlmSpan):
106
+ update_span_properties(current_span, span.span_data)
107
+ return
108
+ observer = self.span_observers.pop(span.span_id, None)
109
+ if observer:
110
+ observer.__exit__(None, None, None)
109
111
 
110
112
  def force_flush(self) -> None:
111
113
  pass
@@ -116,19 +118,18 @@ class DeepEvalTracingProcessor(TracingProcessor):
116
118
  def get_span_kind(self, span_data: "SpanData") -> str:
117
119
  if isinstance(span_data, AgentSpanData):
118
120
  return "agent"
119
- # if isinstance(span_data, FunctionSpanData):
120
- # return "tool"
121
- # if isinstance(span_data, MCPListToolsSpanData):
122
- # return "tool"
123
- # if isinstance(span_data, GenerationSpanData):
124
- # return "llm"
125
- # if isinstance(span_data, ResponseSpanData):
126
- # return "llm"
127
- # if isinstance(span_data, HandoffSpanData):
128
- # return "custom"
129
- # if isinstance(span_data, CustomSpanData):
130
- # return "base"
131
- # if isinstance(span_data, GuardrailSpanData):
132
- # return "base"
133
- # return "base"
134
- return None
121
+ if isinstance(span_data, FunctionSpanData):
122
+ return "tool"
123
+ if isinstance(span_data, MCPListToolsSpanData):
124
+ return "tool"
125
+ if isinstance(span_data, GenerationSpanData):
126
+ return "llm"
127
+ if isinstance(span_data, ResponseSpanData):
128
+ return "llm"
129
+ if isinstance(span_data, HandoffSpanData):
130
+ return "custom"
131
+ if isinstance(span_data, CustomSpanData):
132
+ return "base"
133
+ if isinstance(span_data, GuardrailSpanData):
134
+ return "base"
135
+ return "base"
@@ -1,9 +1,10 @@
1
+ from deepeval.tracing.types import Trace
1
2
  from openai.types.responses.response_input_item_param import (
2
3
  FunctionCallOutput,
3
4
  Message,
4
5
  )
5
6
  from openai.types.responses.response_output_message_param import Content
6
- from typing import Union, List
7
+ from typing import Union, List, Optional
7
8
  from openai.types.responses import (
8
9
  ResponseFunctionToolCallParam,
9
10
  ResponseOutputMessageParam,
@@ -25,6 +26,8 @@ from deepeval.tracing.types import (
25
26
  )
26
27
  import json
27
28
 
29
+ from deepeval.tracing.utils import make_json_serializable
30
+
28
31
  try:
29
32
  from agents import MCPListToolsSpanData
30
33
  from agents.tracing.span_data import (
@@ -89,13 +92,17 @@ def update_span_properties_from_response_span_data(
89
92
  return
90
93
  # Extract usage tokens
91
94
  usage = response.usage
95
+ cached_input_tokens = None
96
+ ouptut_reasoning_tokens = None
92
97
  if usage:
93
98
  output_tokens = usage.output_tokens
94
99
  input_tokens = usage.input_tokens
95
100
  cached_input_tokens = usage.input_tokens_details.cached_tokens
96
101
  ouptut_reasoning_tokens = usage.output_tokens_details.reasoning_tokens
97
102
  # Get input and output
98
- input = parse_response_input(span_data.input)
103
+ input = parse_response_input(
104
+ span_data.input, span_data.response.instructions
105
+ )
99
106
  raw_output = parse_response_output(response.output)
100
107
  output = (
101
108
  raw_output if isinstance(raw_output, str) else json.dumps(raw_output)
@@ -112,6 +119,23 @@ def update_span_properties_from_response_span_data(
112
119
  span.input = input
113
120
  span.output = output
114
121
  span.name = "LLM Generation"
122
+ response_dict = response.model_dump(exclude_none=True, mode="json")
123
+ span.metadata["invocation_params"] = {
124
+ k: v
125
+ for k, v in response_dict.items()
126
+ if k
127
+ in (
128
+ "max_output_tokens",
129
+ "parallel_tool_calls",
130
+ "reasoning",
131
+ "temperature",
132
+ "text",
133
+ "tool_choice",
134
+ "tools",
135
+ "top_p",
136
+ "truncation",
137
+ )
138
+ }
115
139
 
116
140
 
117
141
  def update_span_properties_from_generation_span_data(
@@ -136,6 +160,11 @@ def update_span_properties_from_generation_span_data(
136
160
  span.input = input
137
161
  span.output = output
138
162
  span.name = "LLM Generation"
163
+ span.metadata["invocation_params"] = {
164
+ "model_config": make_json_serializable(
165
+ generation_span_data.model_config
166
+ ),
167
+ }
139
168
 
140
169
 
141
170
  ########################################################
@@ -191,8 +220,6 @@ def update_span_properties_from_agent_span_data(
191
220
  if agent_span_data.output_type:
192
221
  metadata["output_type"] = agent_span_data.output_type
193
222
  span.metadata = metadata
194
- span.input = None
195
- span.output = None
196
223
 
197
224
 
198
225
  ########################################################
@@ -238,10 +265,30 @@ def update_span_properties_from_guardrail_span_data(
238
265
  ########################################################
239
266
 
240
267
 
241
- def parse_response_input(input: Union[str, List[ResponseInputItemParam]]):
242
- if isinstance(input, str):
243
- return input
268
+ def parse_response_input(
269
+ input: Union[str, List[ResponseInputItemParam]],
270
+ instructions: Optional[Union[str, List[ResponseInputItemParam]]] = None,
271
+ ):
272
+
244
273
  processed_input = []
274
+
275
+ if isinstance(input, str) and isinstance(instructions, str):
276
+ return [
277
+ {"type": "message", "role": "system", "content": instructions},
278
+ {"type": "message", "role": "user", "content": input},
279
+ ]
280
+ elif isinstance(input, list) and isinstance(instructions, list):
281
+ input = instructions + input
282
+ elif isinstance(input, list) and isinstance(instructions, str):
283
+ processed_input += [
284
+ {"type": "message", "role": "system", "content": instructions}
285
+ ]
286
+ elif isinstance(input, str) and isinstance(instructions, list):
287
+ processed_input += [
288
+ {"type": "message", "role": "user", "content": input}
289
+ ]
290
+ input = instructions
291
+
245
292
  for item in input:
246
293
  if "type" not in item:
247
294
  if "role" in item and "content" in item:
@@ -365,3 +412,32 @@ def parse_function_call(
365
412
  "name": function_call.name,
366
413
  "arguments": function_call.arguments,
367
414
  }
415
+
416
+
417
+ def update_trace_properties_from_span_data(
418
+ trace: Trace,
419
+ span_data: Union["ResponseSpanData", "GenerationSpanData"],
420
+ ):
421
+ if isinstance(span_data, ResponseSpanData):
422
+ if not trace.input:
423
+ trace.input = parse_response_input(
424
+ span_data.input, span_data.response.instructions
425
+ )
426
+ raw_output = parse_response_output(span_data.response.output)
427
+ output = (
428
+ raw_output
429
+ if isinstance(raw_output, str)
430
+ else json.dumps(raw_output)
431
+ )
432
+ trace.output = output
433
+
434
+ elif isinstance(span_data, GenerationSpanData):
435
+ if not trace.input:
436
+ trace.input = span_data.input
437
+ raw_output = span_data.output
438
+ output = (
439
+ raw_output
440
+ if isinstance(raw_output, str)
441
+ else json.dumps(raw_output)
442
+ )
443
+ trace.output = output