deepeval 3.5.4__py3-none-any.whl → 3.5.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,9 +1,13 @@
1
1
  from deepeval.tracing.tracing import (
2
2
  Observer,
3
3
  current_span_context,
4
+ trace_manager,
4
5
  )
5
6
  from deepeval.openai_agents.extractors import *
6
7
  from deepeval.tracing.context import current_trace_context
8
+ from deepeval.tracing.utils import make_json_serializable
9
+ from time import perf_counter
10
+ from deepeval.tracing.types import TraceSpanStatus
7
11
 
8
12
  try:
9
13
  from agents.tracing import Span, Trace, TracingProcessor
@@ -33,30 +37,57 @@ def _check_openai_agents_available():
33
37
  class DeepEvalTracingProcessor(TracingProcessor):
34
38
  def __init__(self) -> None:
35
39
  _check_openai_agents_available()
36
- self.root_span_observers: dict[str, Observer] = {}
37
40
  self.span_observers: dict[str, Observer] = {}
38
41
 
39
42
  def on_trace_start(self, trace: "Trace") -> None:
40
- pass
43
+ trace_dict = trace.export()
44
+ _trace_uuid = trace_dict.get("id")
45
+ _thread_id = trace_dict.get("group_id")
46
+ _trace_name = trace_dict.get("workflow_name")
47
+ _trace_metadata = trace_dict.get("metadata")
48
+
49
+ if _thread_id or _trace_metadata:
50
+ _trace = trace_manager.start_new_trace(trace_uuid=str(_trace_uuid))
51
+ _trace.thread_id = str(_thread_id)
52
+ _trace.name = str(_trace_name)
53
+ _trace.metadata = make_json_serializable(_trace_metadata)
54
+ current_trace_context.set(_trace)
55
+
56
+ trace_manager.add_span( # adds a dummy root span
57
+ BaseSpan(
58
+ uuid=_trace_uuid,
59
+ trace_uuid=_trace_uuid,
60
+ parent_uuid=None,
61
+ start_time=perf_counter(),
62
+ name=_trace_name,
63
+ status=TraceSpanStatus.IN_PROGRESS,
64
+ children=[],
65
+ )
66
+ )
67
+ else:
68
+ current_trace = current_trace_context.get()
69
+ if current_trace:
70
+ current_trace.name = str(_trace_name)
41
71
 
42
72
  def on_trace_end(self, trace: "Trace") -> None:
43
- pass
73
+ trace_dict = trace.export()
74
+ _trace_uuid = trace_dict.get("id")
75
+ _thread_id = trace_dict.get("group_id")
76
+ _trace_name = trace_dict.get("workflow_name")
77
+ _trace_metadata = trace_dict.get("metadata")
78
+
79
+ if _thread_id or _trace_metadata:
80
+ trace_manager.remove_span(
81
+ _trace_uuid
82
+ ) # removing the dummy root span
83
+ trace_manager.end_trace(_trace_uuid)
84
+ current_trace_context.set(None)
44
85
 
45
86
  def on_span_start(self, span: "Span") -> None:
46
87
  if not span.started_at:
47
88
  return
48
89
  span_type = self.get_span_kind(span.span_data)
49
- if span_type == "agent":
50
- if isinstance(span.span_data, AgentSpanData):
51
- current_trace = current_trace_context.get()
52
- if current_trace:
53
- current_trace.name = span.span_data.name
54
-
55
- if span_type == "tool":
56
- return
57
- elif span_type == "llm":
58
- return
59
- else:
90
+ if span_type and span_type == "agent":
60
91
  observer = Observer(span_type=span_type, func_name="NA")
61
92
  observer.update_span_properties = (
62
93
  lambda base_span: update_span_properties(
@@ -68,13 +99,13 @@ class DeepEvalTracingProcessor(TracingProcessor):
68
99
 
69
100
  def on_span_end(self, span: "Span") -> None:
70
101
  span_type = self.get_span_kind(span.span_data)
71
- if span_type == "llm":
102
+ if span_type and span_type == "agent":
72
103
  current_span = current_span_context.get()
73
104
  if current_span:
74
105
  update_span_properties(current_span, span.span_data)
75
- observer = self.span_observers.pop(span.span_id, None)
76
- if observer:
77
- observer.__exit__(None, None, None)
106
+ observer = self.span_observers.pop(span.span_id, None)
107
+ if observer:
108
+ observer.__exit__(None, None, None)
78
109
 
79
110
  def force_flush(self) -> None:
80
111
  pass
@@ -85,18 +116,19 @@ class DeepEvalTracingProcessor(TracingProcessor):
85
116
  def get_span_kind(self, span_data: "SpanData") -> str:
86
117
  if isinstance(span_data, AgentSpanData):
87
118
  return "agent"
88
- if isinstance(span_data, FunctionSpanData):
89
- return "tool"
90
- if isinstance(span_data, MCPListToolsSpanData):
91
- return "tool"
92
- if isinstance(span_data, GenerationSpanData):
93
- return "llm"
94
- if isinstance(span_data, ResponseSpanData):
95
- return "llm"
96
- if isinstance(span_data, HandoffSpanData):
97
- return "custom"
98
- if isinstance(span_data, CustomSpanData):
99
- return "base"
100
- if isinstance(span_data, GuardrailSpanData):
101
- return "base"
102
- return "base"
119
+ # if isinstance(span_data, FunctionSpanData):
120
+ # return "tool"
121
+ # if isinstance(span_data, MCPListToolsSpanData):
122
+ # return "tool"
123
+ # if isinstance(span_data, GenerationSpanData):
124
+ # return "llm"
125
+ # if isinstance(span_data, ResponseSpanData):
126
+ # return "llm"
127
+ # if isinstance(span_data, HandoffSpanData):
128
+ # return "custom"
129
+ # if isinstance(span_data, CustomSpanData):
130
+ # return "base"
131
+ # if isinstance(span_data, GuardrailSpanData):
132
+ # return "base"
133
+ # return "base"
134
+ return None
@@ -1,114 +1,335 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  from dataclasses import replace
4
+ from typing import List, Any, Union, Optional
5
+
6
+ try:
7
+ from agents import (
8
+ RunConfig,
9
+ RunResult,
10
+ RunResultStreaming,
11
+ Runner as AgentsRunner,
12
+ )
13
+ from agents.agent import Agent
14
+ from agents.models.interface import ModelProvider
15
+ from agents.items import TResponseInputItem
16
+ from agents.lifecycle import RunHooks
17
+ from agents.memory import Session
18
+ from agents.run import DEFAULT_MAX_TURNS
19
+ from agents.run import AgentRunner
20
+ from agents.run_context import TContext
21
+ from agents.models.interface import Model
22
+
23
+ agents_available = True
24
+ except:
25
+ agents_available = False
26
+
27
+
28
+ def is_agents_available():
29
+ if not agents_available:
30
+ raise ImportError(
31
+ "agents is required for this integration. Install it via your package manager"
32
+ )
33
+
4
34
 
5
- from agents import (
6
- Runner as BaseRunner,
7
- RunConfig,
8
- RunResult,
9
- RunResultStreaming,
10
- )
11
35
  from deepeval.tracing.tracing import Observer
12
36
  from deepeval.tracing.context import current_span_context, current_trace_context
13
37
 
14
38
  # Import observed provider/model helpers from our agent module
15
- from deepeval.openai_agents.agent import _ObservedProvider
39
+ from deepeval.metrics import BaseMetric
40
+ from deepeval.openai_agents.agent import _ObservedModel
16
41
 
42
+ _PATCHED_DEFAULT_GET_MODEL = False
17
43
 
18
- class Runner(BaseRunner):
19
- """
20
- Extends Runner to:
21
- - capture metric_collection/metrics at run entry for tracing
22
- - ensure RunConfig.model_provider is wrapped to return observed Models
23
- so string-based model lookups are also instrumented.
24
- """
44
+
45
+ def _patch_default_agent_runner_get_model():
46
+ global _PATCHED_DEFAULT_GET_MODEL
47
+ if _PATCHED_DEFAULT_GET_MODEL:
48
+ return
49
+
50
+ original_get_model = AgentRunner._get_model
25
51
 
26
52
  @classmethod
27
- async def run(cls, *args, **kwargs) -> RunResult:
28
- metric_collection = kwargs.pop("metric_collection", None)
29
- metrics = kwargs.pop("metrics", None)
53
+ def patched_get_model(
54
+ cls, agent: Agent[Any], run_config: RunConfig
55
+ ) -> Model:
56
+ model = original_get_model(agent, run_config)
30
57
 
31
- # Ensure the model provider is wrapped so _get_model(...) uses observed Models
32
- starting_agent = (
33
- args[0] if len(args) > 0 else kwargs.get("starting_agent")
58
+ # Extract attributes from agent if it's a DeepEvalAgent
59
+ llm_metrics = getattr(agent, "llm_metrics", None)
60
+ llm_metric_collection = getattr(agent, "llm_metric_collection", None)
61
+ confident_prompt = getattr(agent, "confident_prompt", None)
62
+ model = _ObservedModel(
63
+ inner=model,
64
+ llm_metric_collection=llm_metric_collection,
65
+ llm_metrics=llm_metrics,
66
+ confident_prompt=confident_prompt,
34
67
  )
35
- run_config: RunConfig | None = kwargs.get("run_config")
36
- if run_config is None:
37
- run_config = RunConfig()
38
- kwargs["run_config"] = run_config
39
-
40
- if run_config.model_provider is not None:
41
- run_config.model_provider = _ObservedProvider(
42
- run_config.model_provider,
43
- metrics=getattr(starting_agent, "metrics", None) or metrics,
44
- metric_collection=getattr(
45
- starting_agent, "metric_collection", None
46
- )
47
- or metric_collection,
48
- deepeval_prompt=getattr(
49
- starting_agent, "deepeval_prompt", None
50
- ),
51
- )
52
68
 
53
- input_val = args[1] if len(args) >= 2 else kwargs.get("input", None)
69
+ return model
70
+
71
+ # Replace the method
72
+ AgentRunner._get_model = patched_get_model
73
+ _PATCHED_DEFAULT_GET_MODEL = True
74
+
75
+
76
+ if agents_available:
77
+ _patch_default_agent_runner_get_model()
78
+
79
+
80
+ class Runner(AgentsRunner):
81
+
82
+ @classmethod
83
+ async def run(
84
+ cls,
85
+ starting_agent: Agent[TContext],
86
+ input: Union[str, list[TResponseInputItem]],
87
+ *,
88
+ context: Optional[TContext] = None,
89
+ max_turns: int = DEFAULT_MAX_TURNS,
90
+ hooks: Optional[RunHooks[TContext]] = None,
91
+ run_config: Optional[RunConfig] = None,
92
+ previous_response_id: Optional[str] = None,
93
+ conversation_id: Optional[str] = None,
94
+ session: Optional[Session] = None,
95
+ metrics: Optional[List[BaseMetric]] = None,
96
+ metric_collection: Optional[str] = None,
97
+ name: Optional[str] = None,
98
+ tags: Optional[List[str]] = None,
99
+ metadata: Optional[dict] = None,
100
+ thread_id: Optional[str] = None,
101
+ user_id: Optional[str] = None,
102
+ **kwargs, # backwards compatibility
103
+ ) -> RunResult:
104
+ is_agents_available()
105
+ # _patch_default_agent_runner_get_model()
106
+
54
107
  with Observer(
55
108
  span_type="custom",
56
109
  metric_collection=metric_collection,
57
110
  metrics=metrics,
58
111
  func_name="run",
59
- function_kwargs={"input": input_val},
112
+ function_kwargs={"input": input}, # also set below
60
113
  ) as observer:
114
+ update_trace_attributes(
115
+ name=name,
116
+ tags=tags,
117
+ metadata=metadata,
118
+ thread_id=thread_id,
119
+ user_id=user_id,
120
+ metric_collection=metric_collection,
121
+ metrics=metrics,
122
+ )
61
123
  current_span = current_span_context.get()
62
124
  current_trace = current_trace_context.get()
63
- current_trace.input = input_val
125
+ if not current_trace.input:
126
+ current_trace.input = input
64
127
  if current_span:
65
- current_span.input = input_val
66
- res = await super().run(*args, **kwargs)
67
- current_trace.output = str(res)
68
- observer.result = str(res)
128
+ current_span.input = input
129
+ res = await super().run(
130
+ starting_agent,
131
+ input,
132
+ context=context,
133
+ max_turns=max_turns,
134
+ hooks=hooks,
135
+ run_config=run_config,
136
+ previous_response_id=previous_response_id,
137
+ conversation_id=conversation_id,
138
+ session=session,
139
+ **kwargs, # backwards compatibility
140
+ )
141
+ current_trace_thread_id = current_trace_context.get().thread_id
142
+ _output = None
143
+ if current_trace_thread_id:
144
+ _output = res.final_output
145
+ else:
146
+ _output = str(res)
147
+ observer.result = _output
148
+ update_trace_attributes(output=_output)
69
149
  return res
70
150
 
71
151
  @classmethod
72
- def run_sync(cls, *args, **kwargs) -> RunResult:
73
- metric_collection = kwargs.pop("metric_collection", None)
74
- metrics = kwargs.pop("metrics", None)
152
+ def run_sync(
153
+ cls,
154
+ starting_agent: Agent[TContext],
155
+ input: Union[str, list[TResponseInputItem]],
156
+ *,
157
+ context: Optional[TContext] = None,
158
+ max_turns: int = DEFAULT_MAX_TURNS,
159
+ hooks: Optional[RunHooks[TContext]] = None,
160
+ run_config: Optional[RunConfig] = None,
161
+ previous_response_id: Optional[str] = None,
162
+ conversation_id: Optional[str] = None,
163
+ session: Optional[Session] = None,
164
+ metrics: Optional[List[BaseMetric]] = None,
165
+ metric_collection: Optional[str] = None,
166
+ name: Optional[str] = None,
167
+ tags: Optional[List[str]] = None,
168
+ metadata: Optional[dict] = None,
169
+ thread_id: Optional[str] = None,
170
+ user_id: Optional[str] = None,
171
+ **kwargs,
172
+ ) -> RunResult:
173
+ is_agents_available()
75
174
 
76
- starting_agent = (
77
- args[0] if len(args) > 0 else kwargs.get("starting_agent")
78
- )
79
- run_config: RunConfig | None = kwargs.get("run_config")
80
- if run_config is None:
81
- run_config = RunConfig()
82
- kwargs["run_config"] = run_config
83
-
84
- if run_config.model_provider is not None:
85
- run_config.model_provider = _ObservedProvider(
86
- run_config.model_provider,
87
- metrics=getattr(starting_agent, "metrics", None) or metrics,
88
- metric_collection=getattr(
89
- starting_agent, "metric_collection", None
90
- )
91
- or metric_collection,
92
- deepeval_prompt=getattr(
93
- starting_agent, "deepeval_prompt", None
94
- ),
95
- )
96
-
97
- input_val = args[1] if len(args) >= 2 else kwargs.get("input", None)
98
175
  with Observer(
99
176
  span_type="custom",
100
177
  metric_collection=metric_collection,
101
178
  metrics=metrics,
102
179
  func_name="run_sync",
103
- function_kwargs={"input": input_val},
180
+ function_kwargs={"input": input}, # also set below
104
181
  ) as observer:
182
+ update_trace_attributes(
183
+ name=name,
184
+ tags=tags,
185
+ metadata=metadata,
186
+ thread_id=thread_id,
187
+ user_id=user_id,
188
+ metric_collection=metric_collection,
189
+ metrics=metrics,
190
+ )
191
+
105
192
  current_span = current_span_context.get()
106
193
  current_trace = current_trace_context.get()
107
- current_trace.input = input_val
194
+ if not current_trace.input:
195
+ current_trace.input = input
108
196
  if current_span:
109
- current_span.input = input_val
110
- res = super().run_sync(*args, **kwargs)
111
- current_trace.output = str(res)
112
- observer.result = str(res)
197
+ current_span.input = input
198
+ res = super().run_sync(
199
+ starting_agent,
200
+ input,
201
+ context=context,
202
+ max_turns=max_turns,
203
+ hooks=hooks,
204
+ run_config=run_config,
205
+ previous_response_id=previous_response_id,
206
+ conversation_id=conversation_id,
207
+ session=session,
208
+ **kwargs, # backwards compatibility
209
+ )
210
+ current_trace_thread_id = current_trace_context.get().thread_id
211
+ _output = None
212
+ if current_trace_thread_id:
213
+ _output = res.final_output
214
+ else:
215
+ _output = str(res)
216
+ update_trace_attributes(output=_output)
217
+ observer.result = _output
113
218
 
114
219
  return res
220
+
221
+ @classmethod
222
+ def run_streamed(
223
+ cls,
224
+ starting_agent: Agent[TContext],
225
+ input: Union[str, list[TResponseInputItem]],
226
+ *,
227
+ context: Optional[TContext] = None,
228
+ max_turns: int = DEFAULT_MAX_TURNS,
229
+ hooks: Optional[RunHooks[TContext]] = None,
230
+ run_config: Optional[RunConfig] = None,
231
+ previous_response_id: Optional[str] = None,
232
+ conversation_id: Optional[str] = None,
233
+ session: Optional[Session] = None,
234
+ metrics: Optional[List[BaseMetric]] = None,
235
+ metric_collection: Optional[str] = None,
236
+ name: Optional[str] = None,
237
+ tags: Optional[List[str]] = None,
238
+ metadata: Optional[dict] = None,
239
+ thread_id: Optional[str] = None,
240
+ user_id: Optional[str] = None,
241
+ **kwargs, # backwards compatibility
242
+ ) -> RunResultStreaming:
243
+ is_agents_available()
244
+ # Manually enter observer; we'll exit when streaming finishes
245
+ observer = Observer(
246
+ span_type="custom",
247
+ metric_collection=metric_collection,
248
+ metrics=metrics,
249
+ func_name="run_streamed",
250
+ function_kwargs={"input": input},
251
+ )
252
+ observer.__enter__()
253
+
254
+ update_trace_attributes(
255
+ name=name,
256
+ tags=tags,
257
+ metadata=metadata,
258
+ thread_id=thread_id,
259
+ user_id=user_id,
260
+ metric_collection=metric_collection,
261
+ metrics=metrics,
262
+ )
263
+ current_trace = current_trace_context.get()
264
+ if not current_trace.input:
265
+ current_trace.input = input
266
+
267
+ current_span = current_span_context.get()
268
+ if current_span:
269
+ current_span.input = input
270
+
271
+ res = super().run_streamed(
272
+ starting_agent,
273
+ input,
274
+ context=context,
275
+ max_turns=max_turns,
276
+ hooks=hooks,
277
+ run_config=run_config,
278
+ previous_response_id=previous_response_id,
279
+ conversation_id=conversation_id,
280
+ session=session,
281
+ **kwargs, # backwards compatibility
282
+ )
283
+
284
+ # Runtime-patch stream_events so the observer closes only after streaming completes
285
+ orig_stream_events = res.stream_events
286
+
287
+ async def _patched_stream_events(self: RunResultStreaming):
288
+ try:
289
+ async for event in orig_stream_events():
290
+ yield event
291
+ observer.result = self.final_output
292
+ update_trace_attributes(output=self.final_output)
293
+ except Exception as e:
294
+ observer.__exit__(type(e), e, e.__traceback__)
295
+ raise
296
+ finally:
297
+ observer.__exit__(None, None, None)
298
+
299
+ from types import MethodType as _MethodType
300
+
301
+ res.stream_events = _MethodType(_patched_stream_events, res)
302
+
303
+ return res
304
+
305
+
306
+ def update_trace_attributes(
307
+ input: Any = None,
308
+ output: Any = None,
309
+ name: str = None,
310
+ tags: List[str] = None,
311
+ metadata: dict = None,
312
+ thread_id: str = None,
313
+ user_id: str = None,
314
+ metric_collection: str = None,
315
+ metrics: List[BaseMetric] = None,
316
+ ):
317
+ current_trace = current_trace_context.get()
318
+ if input:
319
+ current_trace.input = input
320
+ if output:
321
+ current_trace.output = output
322
+ if name:
323
+ current_trace.name = name
324
+ if tags:
325
+ current_trace.tags = tags
326
+ if metadata:
327
+ current_trace.metadata = metadata
328
+ if thread_id:
329
+ current_trace.thread_id = thread_id
330
+ if user_id:
331
+ current_trace.user_id = user_id
332
+ if metric_collection:
333
+ current_trace.metric_collection = metric_collection
334
+ if metrics:
335
+ current_trace.metrics = metrics
deepeval/scorer/scorer.py CHANGED
@@ -223,7 +223,7 @@ class Scorer:
223
223
  Right now we are using the score_one method under the hood, instead of scoring multiple predictions for faithfulness.
224
224
  """
225
225
  try:
226
- from deepeval.models import SummaCModels
226
+ from deepeval.models.summac_model import SummaCModels
227
227
  except Exception as e:
228
228
  print(f"SummaCZS model can not be loaded.\n{e}")
229
229
 
@@ -326,7 +326,7 @@ class Scorer:
326
326
  from sentence_transformers import util
327
327
 
328
328
  try:
329
- from deepeval.models import (
329
+ from deepeval.models.answer_relevancy_model import (
330
330
  AnswerRelevancyModel,
331
331
  CrossEncoderAnswerRelevancyModel,
332
332
  )
@@ -8,9 +8,6 @@ import random
8
8
  import atexit
9
9
  import queue
10
10
  import uuid
11
- import os
12
- import json
13
- import time
14
11
  from openai import OpenAI
15
12
  from rich.console import Console
16
13
  from rich.progress import Progress
@@ -496,6 +493,7 @@ class TraceManager:
496
493
  asyncio.gather(*pending, return_exceptions=True)
497
494
  )
498
495
  self.flush_traces(remaining_trace_request_bodies)
496
+ loop.run_until_complete(loop.shutdown_asyncgens())
499
497
  loop.close()
500
498
 
501
499
  def flush_traces(
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: deepeval
3
- Version: 3.5.4
3
+ Version: 3.5.6
4
4
  Summary: The LLM Evaluation Framework
5
5
  Home-page: https://github.com/confident-ai/deepeval
6
6
  License: Apache-2.0
@@ -186,6 +186,8 @@ Let's pretend your LLM application is a RAG based customer support chatbot; here
186
186
 
187
187
  ## Installation
188
188
 
189
+ DeepEval works with **Python 3.9+**.
190
+
189
191
  ```
190
192
  pip install -U deepeval
191
193
  ```