deepeval 3.5.4__py3-none-any.whl → 3.5.5__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the contents of those package versions as they appear in their respective public registries.
@@ -1,14 +1,20 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  from dataclasses import dataclass, field, replace
4
- from typing import Any, Optional, Awaitable, Callable, Generic, TypeVar
4
+ from typing import Any, Optional, Awaitable, Callable, Generic, TypeVar, List
5
5
 
6
6
  from deepeval.tracing import observe
7
7
  from deepeval.prompt import Prompt
8
+ from deepeval.tracing.tracing import Observer
9
+ from deepeval.metrics import BaseMetric
10
+ from deepeval.tracing.utils import make_json_serializable
11
+ from deepeval.tracing.types import LlmSpan
12
+ from deepeval.tracing.context import current_span_context
8
13
 
9
14
  try:
10
15
  from agents.agent import Agent as BaseAgent
11
16
  from agents.models.interface import Model, ModelProvider
17
+ from openai.types.responses import ResponseCompletedEvent
12
18
  except Exception as e:
13
19
  raise RuntimeError(
14
20
  "openai-agents is required for this integration. Please install it."
@@ -21,17 +27,15 @@ class _ObservedModel(Model):
21
27
  def __init__(
22
28
  self,
23
29
  inner: Model,
24
- *,
25
- metrics: Optional[list[Any]] = None,
26
- metric_collection: Optional[str] = None,
27
- deepeval_prompt: Optional[Any] = None,
30
+ llm_metric_collection: str = None,
31
+ llm_metrics: List[BaseMetric] = None,
32
+ confident_prompt: Prompt = None,
28
33
  ) -> None:
29
34
  self._inner = inner
30
- self._metrics = metrics
31
- self._metric_collection = metric_collection
32
- self._deepeval_prompt = deepeval_prompt
35
+ self._llm_metric_collection = llm_metric_collection
36
+ self._llm_metrics = llm_metrics
37
+ self._confident_prompt = confident_prompt
33
38
 
34
- # Delegate attributes not overridden
35
39
  def __getattr__(self, name: str) -> Any:
36
40
  return getattr(self._inner, name)
37
41
 
@@ -59,29 +63,48 @@ class _ObservedModel(Model):
59
63
  previous_response_id,
60
64
  conversation_id,
61
65
  prompt,
66
+ **kwargs,
62
67
  ):
63
68
  model_name = self._get_model_name()
64
-
65
- wrapped = observe(
66
- metrics=self._metrics,
67
- metric_collection=self._metric_collection,
68
- type="llm",
69
- model=model_name,
70
- prompt=self._deepeval_prompt,
71
- )(self._inner.get_response)
72
-
73
- return await wrapped(
74
- system_instructions,
75
- input,
76
- model_settings,
77
- tools,
78
- output_schema,
79
- handoffs,
80
- tracing,
81
- previous_response_id=previous_response_id,
82
- conversation_id=conversation_id,
83
- prompt=prompt,
84
- )
69
+ with Observer(
70
+ span_type="llm",
71
+ func_name="LLM",
72
+ function_kwargs={
73
+ "system_instructions": system_instructions,
74
+ "input": input,
75
+ "model_settings": model_settings,
76
+ "tools": tools,
77
+ "output_schema": output_schema,
78
+ "handoffs": handoffs,
79
+ # "tracing": tracing, # not important for llm spans
80
+ # "previous_response_id": previous_response_id, # not important for llm spans
81
+ # "conversation_id": conversation_id, # not important for llm spans
82
+ "prompt": prompt,
83
+ **kwargs,
84
+ },
85
+ observe_kwargs={"model": model_name},
86
+ metrics=self._llm_metrics,
87
+ metric_collection=self._llm_metric_collection,
88
+ ) as observer:
89
+ result = await self._inner.get_response(
90
+ system_instructions,
91
+ input,
92
+ model_settings,
93
+ tools,
94
+ output_schema,
95
+ handoffs,
96
+ tracing,
97
+ previous_response_id=previous_response_id,
98
+ conversation_id=conversation_id,
99
+ prompt=prompt,
100
+ **kwargs,
101
+ )
102
+ llm_span: LlmSpan = current_span_context.get()
103
+ llm_span.prompt = self._confident_prompt
104
+
105
+ observer.result = make_json_serializable(result.output)
106
+
107
+ return result
85
108
 
86
109
  def stream_response(
87
110
  self,
@@ -96,91 +119,77 @@ class _ObservedModel(Model):
96
119
  previous_response_id,
97
120
  conversation_id,
98
121
  prompt,
122
+ **kwargs,
99
123
  ):
100
- # Optional: if you also want to observe streaming, uncomment and wrap similarly.
101
- # wrapped = observe(
102
- # metrics=self._metrics,
103
- # metric_collection=self._metric_collection,
104
- # type="llm",
105
- # model=model_name,
106
- # )(self._inner.stream_response)
107
- # return wrapped(
108
- # system_instructions,
109
- # input,
110
- # model_settings,
111
- # tools,
112
- # output_schema,
113
- # handoffs,
114
- # tracing,
115
- # previous_response_id=previous_response_id,
116
- # conversation_id=conversation_id,
117
- # prompt=prompt,
118
- # )
119
- return self._inner.stream_response(
120
- system_instructions,
121
- input,
122
- model_settings,
123
- tools,
124
- output_schema,
125
- handoffs,
126
- tracing,
127
- previous_response_id=previous_response_id,
128
- conversation_id=conversation_id,
129
- prompt=prompt,
130
- )
131
-
132
-
133
- class _ObservedProvider(ModelProvider):
134
- def __init__(
135
- self,
136
- base: ModelProvider,
137
- *,
138
- metrics: Optional[list[Any]] = None,
139
- metric_collection: Optional[str] = None,
140
- deepeval_prompt: Optional[Any] = None,
141
- ) -> None:
142
- self._base = base
143
- self._metrics = metrics
144
- self._metric_collection = metric_collection
145
- self._deepeval_prompt = deepeval_prompt
124
+ model_name = self._get_model_name()
146
125
 
147
- def get_model(self, model_name: str | None) -> Model:
148
- model = self._base.get_model(model_name)
149
- return _ObservedModel(
150
- model,
151
- metrics=self._metrics,
152
- metric_collection=self._metric_collection,
153
- deepeval_prompt=self._deepeval_prompt,
154
- )
126
+ async def _gen():
127
+ observer = Observer(
128
+ span_type="llm",
129
+ func_name="LLM",
130
+ function_kwargs={
131
+ "system_instructions": system_instructions,
132
+ "input": input,
133
+ "model_settings": model_settings,
134
+ "tools": tools,
135
+ "output_schema": output_schema,
136
+ "handoffs": handoffs,
137
+ # "tracing": tracing,
138
+ # "previous_response_id": previous_response_id,
139
+ # "conversation_id": conversation_id,
140
+ "prompt": prompt,
141
+ **kwargs,
142
+ },
143
+ observe_kwargs={"model": model_name},
144
+ metrics=self._llm_metrics,
145
+ metric_collection=self._llm_metric_collection,
146
+ )
147
+ observer.__enter__()
148
+
149
+ llm_span: LlmSpan = current_span_context.get()
150
+ llm_span.prompt = self._confident_prompt
151
+
152
+ try:
153
+ async for event in self._inner.stream_response(
154
+ system_instructions,
155
+ input,
156
+ model_settings,
157
+ tools,
158
+ output_schema,
159
+ handoffs,
160
+ tracing,
161
+ previous_response_id=previous_response_id,
162
+ conversation_id=conversation_id,
163
+ prompt=prompt,
164
+ ):
165
+
166
+ if isinstance(event, ResponseCompletedEvent):
167
+ observer.result = (
168
+ event.response.output_text
169
+ ) # TODO: support other response types
170
+
171
+ yield event
172
+
173
+ observer.__exit__(None, None, None)
174
+ except Exception as e:
175
+ observer.__exit__(type(e), e, e.__traceback__)
176
+ raise
177
+ finally:
178
+
179
+ observer.__exit__(None, None, None)
180
+
181
+ return _gen()
155
182
 
156
183
 
157
184
  @dataclass
158
185
  class DeepEvalAgent(BaseAgent[TContext], Generic[TContext]):
159
186
  """
160
- A subclass of agents.Agent that accepts `metrics` and `metric_collection`
161
- and ensures the underlying model's `get_response` is wrapped with deepeval.observe.
187
+ A subclass of agents.Agent.
162
188
  """
163
189
 
164
- metrics: list[Any] | None = field(default=None)
165
- metric_collection: str | None = field(default=None)
166
- deepeval_prompt: Prompt | None = field(default=None)
190
+ llm_metric_collection: str = None
191
+ llm_metrics: List[BaseMetric] = None
192
+ confident_prompt: Prompt = None
167
193
 
168
194
  def __post_init__(self):
169
195
  super().__post_init__()
170
- # If a direct Model instance is set on the agent, wrap it here.
171
- if self.model is not None and not isinstance(self.model, str):
172
- try:
173
- from agents.models.interface import (
174
- Model as _Model,
175
- ) # local import for safety
176
-
177
- if isinstance(self.model, _Model):
178
- self.model = _ObservedModel(
179
- self.model,
180
- metrics=self.metrics,
181
- metric_collection=self.metric_collection,
182
- deepeval_prompt=self.deepeval_prompt,
183
- )
184
- except Exception:
185
- # If we can't import or wrap, silently skip.
186
- pass
@@ -46,17 +46,7 @@ class DeepEvalTracingProcessor(TracingProcessor):
46
46
  if not span.started_at:
47
47
  return
48
48
  span_type = self.get_span_kind(span.span_data)
49
- if span_type == "agent":
50
- if isinstance(span.span_data, AgentSpanData):
51
- current_trace = current_trace_context.get()
52
- if current_trace:
53
- current_trace.name = span.span_data.name
54
-
55
- if span_type == "tool":
56
- return
57
- elif span_type == "llm":
58
- return
59
- else:
49
+ if span_type and span_type == "agent":
60
50
  observer = Observer(span_type=span_type, func_name="NA")
61
51
  observer.update_span_properties = (
62
52
  lambda base_span: update_span_properties(
@@ -68,13 +58,13 @@ class DeepEvalTracingProcessor(TracingProcessor):
68
58
 
69
59
  def on_span_end(self, span: "Span") -> None:
70
60
  span_type = self.get_span_kind(span.span_data)
71
- if span_type == "llm":
61
+ if span_type and span_type == "agent":
72
62
  current_span = current_span_context.get()
73
63
  if current_span:
74
64
  update_span_properties(current_span, span.span_data)
75
- observer = self.span_observers.pop(span.span_id, None)
76
- if observer:
77
- observer.__exit__(None, None, None)
65
+ observer = self.span_observers.pop(span.span_id, None)
66
+ if observer:
67
+ observer.__exit__(None, None, None)
78
68
 
79
69
  def force_flush(self) -> None:
80
70
  pass
@@ -85,18 +75,19 @@ class DeepEvalTracingProcessor(TracingProcessor):
85
75
  def get_span_kind(self, span_data: "SpanData") -> str:
86
76
  if isinstance(span_data, AgentSpanData):
87
77
  return "agent"
88
- if isinstance(span_data, FunctionSpanData):
89
- return "tool"
90
- if isinstance(span_data, MCPListToolsSpanData):
91
- return "tool"
92
- if isinstance(span_data, GenerationSpanData):
93
- return "llm"
94
- if isinstance(span_data, ResponseSpanData):
95
- return "llm"
96
- if isinstance(span_data, HandoffSpanData):
97
- return "custom"
98
- if isinstance(span_data, CustomSpanData):
99
- return "base"
100
- if isinstance(span_data, GuardrailSpanData):
101
- return "base"
102
- return "base"
78
+ # if isinstance(span_data, FunctionSpanData):
79
+ # return "tool"
80
+ # if isinstance(span_data, MCPListToolsSpanData):
81
+ # return "tool"
82
+ # if isinstance(span_data, GenerationSpanData):
83
+ # return "llm"
84
+ # if isinstance(span_data, ResponseSpanData):
85
+ # return "llm"
86
+ # if isinstance(span_data, HandoffSpanData):
87
+ # return "custom"
88
+ # if isinstance(span_data, CustomSpanData):
89
+ # return "base"
90
+ # if isinstance(span_data, GuardrailSpanData):
91
+ # return "base"
92
+ # return "base"
93
+ return None