deepeval 3.5.9__py3-none-any.whl → 3.6.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
deepeval/_version.py CHANGED
@@ -1 +1 @@
- __version__: str = "3.5.9"
+ __version__: str = "3.6.1"
@@ -15,7 +15,7 @@ from enum import Enum
  from pydantic import SecretStr
  from deepeval.config.settings import get_settings, _SAVE_RE
  from deepeval.cli.dotenv_handler import DotenvHandler
- from deepeval.utils import bool_to_env_str
+ from deepeval.config.utils import bool_to_env_str

  logger = logging.getLogger(__name__)
  StrOrEnum = Union[str, Enum]
@@ -0,0 +1,25 @@
+ from __future__ import annotations
+
+ from contextvars import ContextVar
+ from typing import TYPE_CHECKING, Optional
+
+
+ if TYPE_CHECKING:
+     from deepeval.dataset.golden import Golden
+
+
+ CURRENT_GOLDEN: ContextVar[Optional[Golden]] = ContextVar(
+     "CURRENT_GOLDEN", default=None
+ )
+
+
+ def set_current_golden(golden: Optional[Golden]):
+     return CURRENT_GOLDEN.set(golden)
+
+
+ def get_current_golden() -> Optional[Golden]:
+     return CURRENT_GOLDEN.get()
+
+
+ def reset_current_golden(token) -> None:
+     CURRENT_GOLDEN.reset(token)
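
The new deepeval.contextvars module exposes a token-based API that mirrors contextvars.ContextVar semantics: set_current_golden returns the token that reset_current_golden later consumes. A minimal usage sketch (hypothetical caller code, not part of the package):

    from deepeval.contextvars import (
        set_current_golden,
        get_current_golden,
        reset_current_golden,
    )
    from deepeval.dataset.golden import Golden

    golden = Golden(input="What is DeepEval?")
    token = set_current_golden(golden)   # returns a contextvars.Token
    try:
        assert get_current_golden() is golden
    finally:
        reset_current_golden(token)      # restores the previous value
    assert get_current_golden() is None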
deepeval/dataset/__init__.py CHANGED
@@ -1,5 +1,11 @@
+ from deepeval.contextvars import get_current_golden
  from .dataset import EvaluationDataset
  from .golden import Golden, ConversationalGolden
- from .test_run_tracer import init_global_test_run_tracer

- __all__ = ["EvaluationDataset", "Golden", "ConversationalGolden"]
+
+ __all__ = [
+     "EvaluationDataset",
+     "Golden",
+     "ConversationalGolden",
+     "get_current_golden",
+ ]
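
get_current_golden is re-exported from deepeval.dataset, presumably so application code can look up the golden currently being evaluated without touching the internal module. A hedged sketch of that consumer-side pattern (the evals_iterator setup is an assumption; only the imports are confirmed by this diff):

    from deepeval.dataset import EvaluationDataset, Golden, get_current_golden

    dataset = EvaluationDataset(goldens=[Golden(input="What is DeepEval?")])

    for golden in dataset.evals_iterator():
        # Inside the loop body, and in any code it calls, the same golden
        # is visible through the context variable:
        assert get_current_golden() is golden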
@@ -42,6 +42,7 @@ from deepeval.tracing.api import (
      BaseApiSpan,
  )
  from deepeval.dataset import Golden
+ from deepeval.contextvars import set_current_golden, reset_current_golden
  from deepeval.errors import MissingTestCaseParamsError
  from deepeval.metrics.utils import copy_metrics
  from deepeval.utils import (
@@ -1480,6 +1481,7 @@ def execute_agentic_test_cases_from_loop(
      )

      for golden in goldens:
+         token = set_current_golden(golden)
          with capture_evaluation_run("golden"):
              # yield golden
              count += 1
@@ -1492,8 +1494,14 @@ def execute_agentic_test_cases_from_loop(
                  _progress=progress,
                  _pbar_callback_id=pbar_tags_id,
              ):
-                 yield golden
-                 current_trace: Trace = current_trace_context.get()
+                 try:
+                     # yield golden to user code
+                     yield golden
+                     # control has returned from user code without error, capture trace now
+                     current_trace: Trace = current_trace_context.get()
+                 finally:
+                     # after user code returns control, always reset the context
+                     reset_current_golden(token)

          update_pbar(progress, pbar_tags_id)
      update_pbar(progress, pbar_id)
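
Wrapping the yield in try/finally matters for generators: if the consumer raises inside its loop body or abandons iteration, Python closes the generator by throwing GeneratorExit at the suspended yield, and the finally block still runs. A standalone sketch of that mechanic (plain Python, not package code):

    from contextvars import ContextVar

    CURRENT: ContextVar = ContextVar("CURRENT", default=None)

    def loop(items):
        for item in items:
            token = CURRENT.set(item)
            try:
                yield item
            finally:
                CURRENT.reset(token)   # runs even on GeneratorExit

    gen = loop(["a", "b"])
    next(gen)                          # CURRENT is "a" while suspended
    gen.close()                        # raises GeneratorExit at the yield
    assert CURRENT.get() is None       # the reset still happened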
@@ -1849,6 +1857,7 @@ def a_execute_agentic_test_cases_from_loop(

      try:
          for index, golden in enumerate(goldens):
+             token = set_current_golden(golden)
              current_golden_ctx.update(
                  {
                      "index": index,
@@ -1857,7 +1866,10 @@ def a_execute_agentic_test_cases_from_loop(
                  }
              )
              prev_task_length = len(created_tasks)
-             yield golden
+             try:
+                 yield golden
+             finally:
+                 reset_current_golden(token)
              # if this golden created no tasks, bump bars now
              if len(created_tasks) == prev_task_length:
                  update_pbar(progress, pbar_callback_id)
deepeval/openai_agents/__init__.py CHANGED
@@ -1,6 +1,7 @@
  from deepeval.openai_agents.callback_handler import DeepEvalTracingProcessor
- from deepeval.openai_agents.runner import Runner
- from deepeval.openai_agents.patch import function_tool
  from deepeval.openai_agents.agent import DeepEvalAgent as Agent
+ from deepeval.openai_agents.patch import function_tool
+
+ # from deepeval.openai_agents.runner import Runner

- __all__ = ["DeepEvalTracingProcessor", "Runner", "function_tool", "Agent"]
+ __all__ = ["DeepEvalTracingProcessor", "Agent", "function_tool"]
deepeval/openai_agents/agent.py CHANGED
@@ -1,20 +1,17 @@
  from __future__ import annotations

- from dataclasses import dataclass, field, replace
- from typing import Any, Optional, Awaitable, Callable, Generic, TypeVar, List
+ from dataclasses import dataclass
+ from typing import Generic, TypeVar, List

- from deepeval.tracing import observe
  from deepeval.prompt import Prompt
- from deepeval.tracing.tracing import Observer
  from deepeval.metrics import BaseMetric
- from deepeval.tracing.utils import make_json_serializable
  from deepeval.tracing.types import LlmSpan
- from deepeval.tracing.context import current_span_context

  try:
      from agents.agent import Agent as BaseAgent
-     from agents.models.interface import Model, ModelProvider
-     from openai.types.responses import ResponseCompletedEvent
+     from deepeval.openai_agents.patch import (
+         patch_default_agent_runner_get_model,
+     )
  except Exception as e:
      raise RuntimeError(
          "openai-agents is required for this integration. Please install it."
@@ -23,163 +20,6 @@ except Exception as e:
  TContext = TypeVar("TContext")


- class _ObservedModel(Model):
-     def __init__(
-         self,
-         inner: Model,
-         llm_metric_collection: str = None,
-         llm_metrics: List[BaseMetric] = None,
-         confident_prompt: Prompt = None,
-     ) -> None:
-         self._inner = inner
-         self._llm_metric_collection = llm_metric_collection
-         self._llm_metrics = llm_metrics
-         self._confident_prompt = confident_prompt
-
-     def __getattr__(self, name: str) -> Any:
-         return getattr(self._inner, name)
-
-     def _get_model_name(self) -> str:
-         try:
-             for attr in ("model", "model_name", "name"):
-                 if hasattr(self._inner, attr):
-                     val = getattr(self._inner, attr)
-                     if val is not None:
-                         return str(val)
-         except Exception:
-             pass
-         return "unknown"
-
-     async def get_response(
-         self,
-         system_instructions,
-         input,
-         model_settings,
-         tools,
-         output_schema,
-         handoffs,
-         tracing,
-         *,
-         previous_response_id,
-         conversation_id,
-         prompt,
-         **kwargs,
-     ):
-         model_name = self._get_model_name()
-         with Observer(
-             span_type="llm",
-             func_name="LLM",
-             function_kwargs={
-                 "system_instructions": system_instructions,
-                 "input": input,
-                 "model_settings": model_settings,
-                 "tools": tools,
-                 "output_schema": output_schema,
-                 "handoffs": handoffs,
-                 # "tracing": tracing, # not important for llm spans
-                 # "previous_response_id": previous_response_id, # not important for llm spans
-                 # "conversation_id": conversation_id, # not important for llm spans
-                 "prompt": prompt,
-                 **kwargs,
-             },
-             observe_kwargs={"model": model_name},
-             metrics=self._llm_metrics,
-             metric_collection=self._llm_metric_collection,
-         ) as observer:
-             result = await self._inner.get_response(
-                 system_instructions,
-                 input,
-                 model_settings,
-                 tools,
-                 output_schema,
-                 handoffs,
-                 tracing,
-                 previous_response_id=previous_response_id,
-                 conversation_id=conversation_id,
-                 prompt=prompt,
-                 **kwargs,
-             )
-             llm_span: LlmSpan = current_span_context.get()
-             llm_span.prompt = self._confident_prompt
-
-             observer.result = make_json_serializable(result.output)
-
-             return result
-
-     def stream_response(
-         self,
-         system_instructions,
-         input,
-         model_settings,
-         tools,
-         output_schema,
-         handoffs,
-         tracing,
-         *,
-         previous_response_id,
-         conversation_id,
-         prompt,
-         **kwargs,
-     ):
-         model_name = self._get_model_name()
-
-         async def _gen():
-             observer = Observer(
-                 span_type="llm",
-                 func_name="LLM",
-                 function_kwargs={
-                     "system_instructions": system_instructions,
-                     "input": input,
-                     "model_settings": model_settings,
-                     "tools": tools,
-                     "output_schema": output_schema,
-                     "handoffs": handoffs,
-                     # "tracing": tracing,
-                     # "previous_response_id": previous_response_id,
-                     # "conversation_id": conversation_id,
-                     "prompt": prompt,
-                     **kwargs,
-                 },
-                 observe_kwargs={"model": model_name},
-                 metrics=self._llm_metrics,
-                 metric_collection=self._llm_metric_collection,
-             )
-             observer.__enter__()
-
-             llm_span: LlmSpan = current_span_context.get()
-             llm_span.prompt = self._confident_prompt
-
-             try:
-                 async for event in self._inner.stream_response(
-                     system_instructions,
-                     input,
-                     model_settings,
-                     tools,
-                     output_schema,
-                     handoffs,
-                     tracing,
-                     previous_response_id=previous_response_id,
-                     conversation_id=conversation_id,
-                     prompt=prompt,
-                 ):
-
-                     if isinstance(event, ResponseCompletedEvent):
-                         observer.result = make_json_serializable(
-                             event.response.output
-                         )
-
-                     yield event
-
-             except Exception as e:
-                 observer.__exit__(type(e), e, e.__traceback__)
-                 raise
-             finally:
-
-                 observer.__exit__(None, None, None)
-
-         return _gen()
-
-
  @dataclass
  class DeepEvalAgent(BaseAgent[TContext], Generic[TContext]):
      """
@@ -189,6 +29,8 @@ class DeepEvalAgent(BaseAgent[TContext], Generic[TContext]):
      llm_metric_collection: str = None
      llm_metrics: List[BaseMetric] = None
      confident_prompt: Prompt = None
+     agent_metrics: List[BaseMetric] = None
+     agent_metric_collection: str = None

      def __post_init__(self):
-         super().__post_init__()
+         patch_default_agent_runner_get_model()
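
The patch module itself is not part of this diff, so the exact behavior of patch_default_agent_runner_get_model is unknown; judging by the deleted _ObservedModel wrapper, it presumably replaces per-agent model wrapping with a one-time patch of the runner's model lookup. A generic, illustrative sketch of that wrap-on-resolve pattern (all names hypothetical, not the real patch module):

    import functools

    def patch_method_to_wrap_result(cls, method_name, wrap):
        # Replace cls.method_name so its return value passes through wrap().
        original = getattr(cls, method_name)

        @functools.wraps(original)
        def patched(self, *args, **kwargs):
            return wrap(original(self, *args, **kwargs))

        setattr(cls, method_name, patched)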
deepeval/openai_agents/callback_handler.py CHANGED
@@ -21,6 +21,10 @@ try:
          ResponseSpanData,
          SpanData,
      )
+     from deepeval.openai_agents.patch import (
+         patch_default_agent_run_single_turn,
+         patch_default_agent_run_single_turn_streamed,
+     )

      openai_agents_available = True
  except ImportError:
@@ -37,6 +41,8 @@ def _check_openai_agents_available():
  class DeepEvalTracingProcessor(TracingProcessor):
      def __init__(self) -> None:
          _check_openai_agents_available()
+         patch_default_agent_run_single_turn()
+         patch_default_agent_run_single_turn_streamed()
          self.span_observers: dict[str, Observer] = {}

      def on_trace_start(self, trace: "Trace") -> None:
@@ -46,66 +52,69 @@ class DeepEvalTracingProcessor(TracingProcessor):
          _trace_name = trace_dict.get("workflow_name")
          _trace_metadata = trace_dict.get("metadata")

-         if _thread_id or _trace_metadata:
-             _trace = trace_manager.start_new_trace(trace_uuid=str(_trace_uuid))
-             _trace.thread_id = str(_thread_id)
-             _trace.name = str(_trace_name)
-             _trace.metadata = make_json_serializable(_trace_metadata)
-             current_trace_context.set(_trace)
-
-             trace_manager.add_span(  # adds a dummy root span
-                 BaseSpan(
-                     uuid=_trace_uuid,
-                     trace_uuid=_trace_uuid,
-                     parent_uuid=None,
-                     start_time=perf_counter(),
-                     name=_trace_name,
-                     status=TraceSpanStatus.IN_PROGRESS,
-                     children=[],
-                 )
+         _trace = trace_manager.start_new_trace(trace_uuid=str(_trace_uuid))
+         _trace.thread_id = str(_thread_id)
+         _trace.name = str(_trace_name)
+         _trace.metadata = make_json_serializable(_trace_metadata)
+         current_trace_context.set(_trace)
+
+         trace_manager.add_span(  # adds a dummy root span
+             BaseSpan(
+                 uuid=_trace_uuid,
+                 trace_uuid=_trace_uuid,
+                 parent_uuid=None,
+                 start_time=perf_counter(),
+                 name=_trace_name,
+                 status=TraceSpanStatus.IN_PROGRESS,
+                 children=[],
              )
-         else:
-             current_trace = current_trace_context.get()
-             if current_trace:
-                 current_trace.name = str(_trace_name)
+         )

      def on_trace_end(self, trace: "Trace") -> None:
          trace_dict = trace.export()
          _trace_uuid = trace_dict.get("id")
-         _thread_id = trace_dict.get("group_id")
          _trace_name = trace_dict.get("workflow_name")
-         _trace_metadata = trace_dict.get("metadata")

-         if _thread_id or _trace_metadata:
-             trace_manager.remove_span(
-                 _trace_uuid
-             )  # removing the dummy root span
-             trace_manager.end_trace(_trace_uuid)
-             current_trace_context.set(None)
+         trace_manager.remove_span(_trace_uuid)  # removing the dummy root span
+         trace_manager.end_trace(_trace_uuid)
+         current_trace_context.set(None)

      def on_span_start(self, span: "Span") -> None:
          if not span.started_at:
              return
+         current_span = current_span_context.get()
+         if current_span and isinstance(
+             current_span, LlmSpan
+         ):  # llm span started by
+             return
+
          span_type = self.get_span_kind(span.span_data)
-         if span_type and span_type == "agent":
-             observer = Observer(span_type=span_type, func_name="NA")
-             observer.update_span_properties = (
-                 lambda base_span: update_span_properties(
-                     base_span, span.span_data
-                 )
-             )
-             self.span_observers[span.span_id] = observer
-             observer.__enter__()
+         observer = Observer(span_type=span_type, func_name="NA")
+         if span_type == "llm":
+             observer.observe_kwargs["model"] = "temporary model"
+         observer.update_span_properties = (
+             lambda span_type: update_span_properties(span_type, span.span_data)
+         )
+         self.span_observers[span.span_id] = observer
+         observer.__enter__()

      def on_span_end(self, span: "Span") -> None:
+         update_trace_properties_from_span_data(
+             current_trace_context.get(), span.span_data
+         )
+
          span_type = self.get_span_kind(span.span_data)
-         if span_type and span_type == "agent":
-             current_span = current_span_context.get()
-             if current_span:
-                 update_span_properties(current_span, span.span_data)
-             observer = self.span_observers.pop(span.span_id, None)
-             if observer:
-                 observer.__exit__(None, None, None)
+         current_span = current_span_context.get()
+         if (
+             current_span
+             and isinstance(current_span, LlmSpan)
+             and span_type == "llm"
+         ):  # additional check if the span kind data is llm too
+             update_span_properties(current_span, span.span_data)
+
+         observer = self.span_observers.pop(span.span_id, None)
+         if observer:
+             observer.__exit__(None, None, None)

      def force_flush(self) -> None:
          pass
@@ -116,19 +125,18 @@ class DeepEvalTracingProcessor(TracingProcessor):
      def get_span_kind(self, span_data: "SpanData") -> str:
          if isinstance(span_data, AgentSpanData):
              return "agent"
-         # if isinstance(span_data, FunctionSpanData):
-         #     return "tool"
-         # if isinstance(span_data, MCPListToolsSpanData):
-         #     return "tool"
-         # if isinstance(span_data, GenerationSpanData):
-         #     return "llm"
-         # if isinstance(span_data, ResponseSpanData):
-         #     return "llm"
-         # if isinstance(span_data, HandoffSpanData):
-         #     return "custom"
-         # if isinstance(span_data, CustomSpanData):
-         #     return "base"
-         # if isinstance(span_data, GuardrailSpanData):
-         #     return "base"
-         # return "base"
-         return None
+         if isinstance(span_data, FunctionSpanData):
+             return "tool"
+         if isinstance(span_data, MCPListToolsSpanData):
+             return "tool"
+         if isinstance(span_data, GenerationSpanData):
+             return "llm"
+         if isinstance(span_data, ResponseSpanData):
+             return "llm"
+         if isinstance(span_data, HandoffSpanData):
+             return "custom"
+         if isinstance(span_data, CustomSpanData):
+             return "base"
+         if isinstance(span_data, GuardrailSpanData):
+             return "base"
+         return "base"
@@ -1,9 +1,10 @@
+ from deepeval.tracing.types import Trace
  from openai.types.responses.response_input_item_param import (
      FunctionCallOutput,
      Message,
  )
  from openai.types.responses.response_output_message_param import Content
- from typing import Union, List
+ from typing import Union, List, Optional
  from openai.types.responses import (
      ResponseFunctionToolCallParam,
      ResponseOutputMessageParam,
@@ -25,6 +26,8 @@ from deepeval.tracing.types import (
  )
  import json

+ from deepeval.tracing.utils import make_json_serializable
+
  try:
      from agents import MCPListToolsSpanData
      from agents.tracing.span_data import (
@@ -89,13 +92,17 @@ def update_span_properties_from_response_span_data(
          return
      # Extract usage tokens
      usage = response.usage
+     cached_input_tokens = None
+     ouptut_reasoning_tokens = None
      if usage:
          output_tokens = usage.output_tokens
          input_tokens = usage.input_tokens
          cached_input_tokens = usage.input_tokens_details.cached_tokens
          ouptut_reasoning_tokens = usage.output_tokens_details.reasoning_tokens
      # Get input and output
-     input = parse_response_input(span_data.input)
+     input = parse_response_input(
+         span_data.input, span_data.response.instructions
+     )
      raw_output = parse_response_output(response.output)
      output = (
          raw_output if isinstance(raw_output, str) else json.dumps(raw_output)
@@ -112,6 +119,23 @@ def update_span_properties_from_response_span_data(
      span.input = input
      span.output = output
      span.name = "LLM Generation"
+     response_dict = response.model_dump(exclude_none=True, mode="json")
+     span.metadata["invocation_params"] = {
+         k: v
+         for k, v in response_dict.items()
+         if k
+         in (
+             "max_output_tokens",
+             "parallel_tool_calls",
+             "reasoning",
+             "temperature",
+             "text",
+             "tool_choice",
+             "tools",
+             "top_p",
+             "truncation",
+         )
+     }


  def update_span_properties_from_generation_span_data(
@@ -136,6 +160,11 @@ def update_span_properties_from_generation_span_data(
      span.input = input
      span.output = output
      span.name = "LLM Generation"
+     span.metadata["invocation_params"] = {
+         "model_config": make_json_serializable(
+             generation_span_data.model_config
+         ),
+     }


  ########################################################
@@ -191,8 +220,6 @@ def update_span_properties_from_agent_span_data(
      if agent_span_data.output_type:
          metadata["output_type"] = agent_span_data.output_type
      span.metadata = metadata
-     span.input = None
-     span.output = None


  ########################################################
@@ -238,10 +265,30 @@ def update_span_properties_from_guardrail_span_data(
  ########################################################


- def parse_response_input(input: Union[str, List[ResponseInputItemParam]]):
-     if isinstance(input, str):
-         return input
+ def parse_response_input(
+     input: Union[str, List[ResponseInputItemParam]],
+     instructions: Optional[Union[str, List[ResponseInputItemParam]]] = None,
+ ):
+
      processed_input = []
+
+     if isinstance(input, str) and isinstance(instructions, str):
+         return [
+             {"type": "message", "role": "system", "content": instructions},
+             {"type": "message", "role": "user", "content": input},
+         ]
+     elif isinstance(input, list) and isinstance(instructions, list):
+         input = instructions + input
+     elif isinstance(input, list) and isinstance(instructions, str):
+         processed_input += [
+             {"type": "message", "role": "system", "content": instructions}
+         ]
+     elif isinstance(input, str) and isinstance(instructions, list):
+         processed_input += [
+             {"type": "message", "role": "user", "content": input}
+         ]
+         input = instructions
+
      for item in input:
          if "type" not in item:
              if "role" in item and "content" in item:
@@ -365,3 +412,32 @@ def parse_function_call(
          "name": function_call.name,
          "arguments": function_call.arguments,
      }
+
+
+ def update_trace_properties_from_span_data(
+     trace: Trace,
+     span_data: Union["ResponseSpanData", "GenerationSpanData"],
+ ):
+     if isinstance(span_data, ResponseSpanData):
+         if not trace.input:
+             trace.input = parse_response_input(
+                 span_data.input, span_data.response.instructions
+             )
+         raw_output = parse_response_output(span_data.response.output)
+         output = (
+             raw_output
+             if isinstance(raw_output, str)
+             else json.dumps(raw_output)
+         )
+         trace.output = output
+
+     elif isinstance(span_data, GenerationSpanData):
+         if not trace.input:
+             trace.input = span_data.input
+         raw_output = span_data.output
+         output = (
+             raw_output
+             if isinstance(raw_output, str)
+             else json.dumps(raw_output)
+         )
+         trace.output = output
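
Because update_trace_properties_from_span_data only assigns trace.input when it is still empty but assigns trace.output unconditionally, the first LLM span's input and the last LLM span's output end up on the trace. A toy illustration of that accumulation rule (plain Python, not package types):

    class FakeTrace:
        input = None
        output = None

    def apply(trace, span_input, span_output):
        if not trace.input:
            trace.input = span_input    # first span to arrive wins
        trace.output = span_output      # every span overwrites; last wins

    t = FakeTrace()
    apply(t, "turn 1 input", "turn 1 output")
    apply(t, "turn 2 input", "turn 2 output")
    assert t.input == "turn 1 input"
    assert t.output == "turn 2 output"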