deepeval 3.5.4__py3-none-any.whl → 3.5.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/cli/main.py +182 -18
- deepeval/config/settings.py +14 -0
- deepeval/constants.py +2 -1
- deepeval/dataset/dataset.py +11 -4
- deepeval/dataset/types.py +19 -11
- deepeval/dataset/utils.py +31 -3
- deepeval/evaluate/execute.py +226 -23
- deepeval/openai_agents/agent.py +115 -106
- deepeval/openai_agents/callback_handler.py +65 -33
- deepeval/openai_agents/runner.py +296 -75
- deepeval/scorer/scorer.py +2 -2
- deepeval/tracing/tracing.py +1 -3
- {deepeval-3.5.4.dist-info → deepeval-3.5.6.dist-info}/METADATA +3 -1
- {deepeval-3.5.4.dist-info → deepeval-3.5.6.dist-info}/RECORD +18 -18
- {deepeval-3.5.4.dist-info → deepeval-3.5.6.dist-info}/LICENSE.md +0 -0
- {deepeval-3.5.4.dist-info → deepeval-3.5.6.dist-info}/WHEEL +0 -0
- {deepeval-3.5.4.dist-info → deepeval-3.5.6.dist-info}/entry_points.txt +0 -0
|
@@ -1,9 +1,13 @@
|
|
|
1
1
|
from deepeval.tracing.tracing import (
|
|
2
2
|
Observer,
|
|
3
3
|
current_span_context,
|
|
4
|
+
trace_manager,
|
|
4
5
|
)
|
|
5
6
|
from deepeval.openai_agents.extractors import *
|
|
6
7
|
from deepeval.tracing.context import current_trace_context
|
|
8
|
+
from deepeval.tracing.utils import make_json_serializable
|
|
9
|
+
from time import perf_counter
|
|
10
|
+
from deepeval.tracing.types import TraceSpanStatus
|
|
7
11
|
|
|
8
12
|
try:
|
|
9
13
|
from agents.tracing import Span, Trace, TracingProcessor
|
|
@@ -33,30 +37,57 @@ def _check_openai_agents_available():
|
|
|
33
37
|
class DeepEvalTracingProcessor(TracingProcessor):
|
|
34
38
|
def __init__(self) -> None:
|
|
35
39
|
_check_openai_agents_available()
|
|
36
|
-
self.root_span_observers: dict[str, Observer] = {}
|
|
37
40
|
self.span_observers: dict[str, Observer] = {}
|
|
38
41
|
|
|
39
42
|
def on_trace_start(self, trace: "Trace") -> None:
|
|
40
|
-
|
|
43
|
+
trace_dict = trace.export()
|
|
44
|
+
_trace_uuid = trace_dict.get("id")
|
|
45
|
+
_thread_id = trace_dict.get("group_id")
|
|
46
|
+
_trace_name = trace_dict.get("workflow_name")
|
|
47
|
+
_trace_metadata = trace_dict.get("metadata")
|
|
48
|
+
|
|
49
|
+
if _thread_id or _trace_metadata:
|
|
50
|
+
_trace = trace_manager.start_new_trace(trace_uuid=str(_trace_uuid))
|
|
51
|
+
_trace.thread_id = str(_thread_id)
|
|
52
|
+
_trace.name = str(_trace_name)
|
|
53
|
+
_trace.metadata = make_json_serializable(_trace_metadata)
|
|
54
|
+
current_trace_context.set(_trace)
|
|
55
|
+
|
|
56
|
+
trace_manager.add_span( # adds a dummy root span
|
|
57
|
+
BaseSpan(
|
|
58
|
+
uuid=_trace_uuid,
|
|
59
|
+
trace_uuid=_trace_uuid,
|
|
60
|
+
parent_uuid=None,
|
|
61
|
+
start_time=perf_counter(),
|
|
62
|
+
name=_trace_name,
|
|
63
|
+
status=TraceSpanStatus.IN_PROGRESS,
|
|
64
|
+
children=[],
|
|
65
|
+
)
|
|
66
|
+
)
|
|
67
|
+
else:
|
|
68
|
+
current_trace = current_trace_context.get()
|
|
69
|
+
if current_trace:
|
|
70
|
+
current_trace.name = str(_trace_name)
|
|
41
71
|
|
|
42
72
|
def on_trace_end(self, trace: "Trace") -> None:
|
|
43
|
-
|
|
73
|
+
trace_dict = trace.export()
|
|
74
|
+
_trace_uuid = trace_dict.get("id")
|
|
75
|
+
_thread_id = trace_dict.get("group_id")
|
|
76
|
+
_trace_name = trace_dict.get("workflow_name")
|
|
77
|
+
_trace_metadata = trace_dict.get("metadata")
|
|
78
|
+
|
|
79
|
+
if _thread_id or _trace_metadata:
|
|
80
|
+
trace_manager.remove_span(
|
|
81
|
+
_trace_uuid
|
|
82
|
+
) # removing the dummy root span
|
|
83
|
+
trace_manager.end_trace(_trace_uuid)
|
|
84
|
+
current_trace_context.set(None)
|
|
44
85
|
|
|
45
86
|
def on_span_start(self, span: "Span") -> None:
|
|
46
87
|
if not span.started_at:
|
|
47
88
|
return
|
|
48
89
|
span_type = self.get_span_kind(span.span_data)
|
|
49
|
-
if span_type == "agent":
|
|
50
|
-
if isinstance(span.span_data, AgentSpanData):
|
|
51
|
-
current_trace = current_trace_context.get()
|
|
52
|
-
if current_trace:
|
|
53
|
-
current_trace.name = span.span_data.name
|
|
54
|
-
|
|
55
|
-
if span_type == "tool":
|
|
56
|
-
return
|
|
57
|
-
elif span_type == "llm":
|
|
58
|
-
return
|
|
59
|
-
else:
|
|
90
|
+
if span_type and span_type == "agent":
|
|
60
91
|
observer = Observer(span_type=span_type, func_name="NA")
|
|
61
92
|
observer.update_span_properties = (
|
|
62
93
|
lambda base_span: update_span_properties(
|
|
@@ -68,13 +99,13 @@ class DeepEvalTracingProcessor(TracingProcessor):
|
|
|
68
99
|
|
|
69
100
|
def on_span_end(self, span: "Span") -> None:
|
|
70
101
|
span_type = self.get_span_kind(span.span_data)
|
|
71
|
-
if span_type == "
|
|
102
|
+
if span_type and span_type == "agent":
|
|
72
103
|
current_span = current_span_context.get()
|
|
73
104
|
if current_span:
|
|
74
105
|
update_span_properties(current_span, span.span_data)
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
106
|
+
observer = self.span_observers.pop(span.span_id, None)
|
|
107
|
+
if observer:
|
|
108
|
+
observer.__exit__(None, None, None)
|
|
78
109
|
|
|
79
110
|
def force_flush(self) -> None:
|
|
80
111
|
pass
|
|
@@ -85,18 +116,19 @@ class DeepEvalTracingProcessor(TracingProcessor):
|
|
|
85
116
|
def get_span_kind(self, span_data: "SpanData") -> str:
|
|
86
117
|
if isinstance(span_data, AgentSpanData):
|
|
87
118
|
return "agent"
|
|
88
|
-
if isinstance(span_data, FunctionSpanData):
|
|
89
|
-
|
|
90
|
-
if isinstance(span_data, MCPListToolsSpanData):
|
|
91
|
-
|
|
92
|
-
if isinstance(span_data, GenerationSpanData):
|
|
93
|
-
|
|
94
|
-
if isinstance(span_data, ResponseSpanData):
|
|
95
|
-
|
|
96
|
-
if isinstance(span_data, HandoffSpanData):
|
|
97
|
-
|
|
98
|
-
if isinstance(span_data, CustomSpanData):
|
|
99
|
-
|
|
100
|
-
if isinstance(span_data, GuardrailSpanData):
|
|
101
|
-
|
|
102
|
-
return "base"
|
|
119
|
+
# if isinstance(span_data, FunctionSpanData):
|
|
120
|
+
# return "tool"
|
|
121
|
+
# if isinstance(span_data, MCPListToolsSpanData):
|
|
122
|
+
# return "tool"
|
|
123
|
+
# if isinstance(span_data, GenerationSpanData):
|
|
124
|
+
# return "llm"
|
|
125
|
+
# if isinstance(span_data, ResponseSpanData):
|
|
126
|
+
# return "llm"
|
|
127
|
+
# if isinstance(span_data, HandoffSpanData):
|
|
128
|
+
# return "custom"
|
|
129
|
+
# if isinstance(span_data, CustomSpanData):
|
|
130
|
+
# return "base"
|
|
131
|
+
# if isinstance(span_data, GuardrailSpanData):
|
|
132
|
+
# return "base"
|
|
133
|
+
# return "base"
|
|
134
|
+
return None
|
deepeval/openai_agents/runner.py
CHANGED
|
@@ -1,114 +1,335 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
from dataclasses import replace
|
|
4
|
+
from typing import List, Any, Union, Optional
|
|
5
|
+
|
|
6
|
+
try:
|
|
7
|
+
from agents import (
|
|
8
|
+
RunConfig,
|
|
9
|
+
RunResult,
|
|
10
|
+
RunResultStreaming,
|
|
11
|
+
Runner as AgentsRunner,
|
|
12
|
+
)
|
|
13
|
+
from agents.agent import Agent
|
|
14
|
+
from agents.models.interface import ModelProvider
|
|
15
|
+
from agents.items import TResponseInputItem
|
|
16
|
+
from agents.lifecycle import RunHooks
|
|
17
|
+
from agents.memory import Session
|
|
18
|
+
from agents.run import DEFAULT_MAX_TURNS
|
|
19
|
+
from agents.run import AgentRunner
|
|
20
|
+
from agents.run_context import TContext
|
|
21
|
+
from agents.models.interface import Model
|
|
22
|
+
|
|
23
|
+
agents_available = True
|
|
24
|
+
except:
|
|
25
|
+
agents_available = False
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def is_agents_available():
|
|
29
|
+
if not agents_available:
|
|
30
|
+
raise ImportError(
|
|
31
|
+
"agents is required for this integration. Install it via your package manager"
|
|
32
|
+
)
|
|
33
|
+
|
|
4
34
|
|
|
5
|
-
from agents import (
|
|
6
|
-
Runner as BaseRunner,
|
|
7
|
-
RunConfig,
|
|
8
|
-
RunResult,
|
|
9
|
-
RunResultStreaming,
|
|
10
|
-
)
|
|
11
35
|
from deepeval.tracing.tracing import Observer
|
|
12
36
|
from deepeval.tracing.context import current_span_context, current_trace_context
|
|
13
37
|
|
|
14
38
|
# Import observed provider/model helpers from our agent module
|
|
15
|
-
from deepeval.
|
|
39
|
+
from deepeval.metrics import BaseMetric
|
|
40
|
+
from deepeval.openai_agents.agent import _ObservedModel
|
|
16
41
|
|
|
42
|
+
_PATCHED_DEFAULT_GET_MODEL = False
|
|
17
43
|
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
44
|
+
|
|
45
|
+
def _patch_default_agent_runner_get_model():
|
|
46
|
+
global _PATCHED_DEFAULT_GET_MODEL
|
|
47
|
+
if _PATCHED_DEFAULT_GET_MODEL:
|
|
48
|
+
return
|
|
49
|
+
|
|
50
|
+
original_get_model = AgentRunner._get_model
|
|
25
51
|
|
|
26
52
|
@classmethod
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
53
|
+
def patched_get_model(
|
|
54
|
+
cls, agent: Agent[Any], run_config: RunConfig
|
|
55
|
+
) -> Model:
|
|
56
|
+
model = original_get_model(agent, run_config)
|
|
30
57
|
|
|
31
|
-
#
|
|
32
|
-
|
|
33
|
-
|
|
58
|
+
# Extract attributes from agent if it's a DeepEvalAgent
|
|
59
|
+
llm_metrics = getattr(agent, "llm_metrics", None)
|
|
60
|
+
llm_metric_collection = getattr(agent, "llm_metric_collection", None)
|
|
61
|
+
confident_prompt = getattr(agent, "confident_prompt", None)
|
|
62
|
+
model = _ObservedModel(
|
|
63
|
+
inner=model,
|
|
64
|
+
llm_metric_collection=llm_metric_collection,
|
|
65
|
+
llm_metrics=llm_metrics,
|
|
66
|
+
confident_prompt=confident_prompt,
|
|
34
67
|
)
|
|
35
|
-
run_config: RunConfig | None = kwargs.get("run_config")
|
|
36
|
-
if run_config is None:
|
|
37
|
-
run_config = RunConfig()
|
|
38
|
-
kwargs["run_config"] = run_config
|
|
39
|
-
|
|
40
|
-
if run_config.model_provider is not None:
|
|
41
|
-
run_config.model_provider = _ObservedProvider(
|
|
42
|
-
run_config.model_provider,
|
|
43
|
-
metrics=getattr(starting_agent, "metrics", None) or metrics,
|
|
44
|
-
metric_collection=getattr(
|
|
45
|
-
starting_agent, "metric_collection", None
|
|
46
|
-
)
|
|
47
|
-
or metric_collection,
|
|
48
|
-
deepeval_prompt=getattr(
|
|
49
|
-
starting_agent, "deepeval_prompt", None
|
|
50
|
-
),
|
|
51
|
-
)
|
|
52
68
|
|
|
53
|
-
|
|
69
|
+
return model
|
|
70
|
+
|
|
71
|
+
# Replace the method
|
|
72
|
+
AgentRunner._get_model = patched_get_model
|
|
73
|
+
_PATCHED_DEFAULT_GET_MODEL = True
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
if agents_available:
|
|
77
|
+
_patch_default_agent_runner_get_model()
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
class Runner(AgentsRunner):
|
|
81
|
+
|
|
82
|
+
@classmethod
|
|
83
|
+
async def run(
|
|
84
|
+
cls,
|
|
85
|
+
starting_agent: Agent[TContext],
|
|
86
|
+
input: Union[str, list[TResponseInputItem]],
|
|
87
|
+
*,
|
|
88
|
+
context: Optional[TContext] = None,
|
|
89
|
+
max_turns: int = DEFAULT_MAX_TURNS,
|
|
90
|
+
hooks: Optional[RunHooks[TContext]] = None,
|
|
91
|
+
run_config: Optional[RunConfig] = None,
|
|
92
|
+
previous_response_id: Optional[str] = None,
|
|
93
|
+
conversation_id: Optional[str] = None,
|
|
94
|
+
session: Optional[Session] = None,
|
|
95
|
+
metrics: Optional[List[BaseMetric]] = None,
|
|
96
|
+
metric_collection: Optional[str] = None,
|
|
97
|
+
name: Optional[str] = None,
|
|
98
|
+
tags: Optional[List[str]] = None,
|
|
99
|
+
metadata: Optional[dict] = None,
|
|
100
|
+
thread_id: Optional[str] = None,
|
|
101
|
+
user_id: Optional[str] = None,
|
|
102
|
+
**kwargs, # backwards compatibility
|
|
103
|
+
) -> RunResult:
|
|
104
|
+
is_agents_available()
|
|
105
|
+
# _patch_default_agent_runner_get_model()
|
|
106
|
+
|
|
54
107
|
with Observer(
|
|
55
108
|
span_type="custom",
|
|
56
109
|
metric_collection=metric_collection,
|
|
57
110
|
metrics=metrics,
|
|
58
111
|
func_name="run",
|
|
59
|
-
function_kwargs={"input":
|
|
112
|
+
function_kwargs={"input": input}, # also set below
|
|
60
113
|
) as observer:
|
|
114
|
+
update_trace_attributes(
|
|
115
|
+
name=name,
|
|
116
|
+
tags=tags,
|
|
117
|
+
metadata=metadata,
|
|
118
|
+
thread_id=thread_id,
|
|
119
|
+
user_id=user_id,
|
|
120
|
+
metric_collection=metric_collection,
|
|
121
|
+
metrics=metrics,
|
|
122
|
+
)
|
|
61
123
|
current_span = current_span_context.get()
|
|
62
124
|
current_trace = current_trace_context.get()
|
|
63
|
-
current_trace.input
|
|
125
|
+
if not current_trace.input:
|
|
126
|
+
current_trace.input = input
|
|
64
127
|
if current_span:
|
|
65
|
-
current_span.input =
|
|
66
|
-
res = await super().run(
|
|
67
|
-
|
|
68
|
-
|
|
128
|
+
current_span.input = input
|
|
129
|
+
res = await super().run(
|
|
130
|
+
starting_agent,
|
|
131
|
+
input,
|
|
132
|
+
context=context,
|
|
133
|
+
max_turns=max_turns,
|
|
134
|
+
hooks=hooks,
|
|
135
|
+
run_config=run_config,
|
|
136
|
+
previous_response_id=previous_response_id,
|
|
137
|
+
conversation_id=conversation_id,
|
|
138
|
+
session=session,
|
|
139
|
+
**kwargs, # backwards compatibility
|
|
140
|
+
)
|
|
141
|
+
current_trace_thread_id = current_trace_context.get().thread_id
|
|
142
|
+
_output = None
|
|
143
|
+
if current_trace_thread_id:
|
|
144
|
+
_output = res.final_output
|
|
145
|
+
else:
|
|
146
|
+
_output = str(res)
|
|
147
|
+
observer.result = _output
|
|
148
|
+
update_trace_attributes(output=_output)
|
|
69
149
|
return res
|
|
70
150
|
|
|
71
151
|
@classmethod
|
|
72
|
-
def run_sync(
|
|
73
|
-
|
|
74
|
-
|
|
152
|
+
def run_sync(
|
|
153
|
+
cls,
|
|
154
|
+
starting_agent: Agent[TContext],
|
|
155
|
+
input: Union[str, list[TResponseInputItem]],
|
|
156
|
+
*,
|
|
157
|
+
context: Optional[TContext] = None,
|
|
158
|
+
max_turns: int = DEFAULT_MAX_TURNS,
|
|
159
|
+
hooks: Optional[RunHooks[TContext]] = None,
|
|
160
|
+
run_config: Optional[RunConfig] = None,
|
|
161
|
+
previous_response_id: Optional[str] = None,
|
|
162
|
+
conversation_id: Optional[str] = None,
|
|
163
|
+
session: Optional[Session] = None,
|
|
164
|
+
metrics: Optional[List[BaseMetric]] = None,
|
|
165
|
+
metric_collection: Optional[str] = None,
|
|
166
|
+
name: Optional[str] = None,
|
|
167
|
+
tags: Optional[List[str]] = None,
|
|
168
|
+
metadata: Optional[dict] = None,
|
|
169
|
+
thread_id: Optional[str] = None,
|
|
170
|
+
user_id: Optional[str] = None,
|
|
171
|
+
**kwargs,
|
|
172
|
+
) -> RunResult:
|
|
173
|
+
is_agents_available()
|
|
75
174
|
|
|
76
|
-
starting_agent = (
|
|
77
|
-
args[0] if len(args) > 0 else kwargs.get("starting_agent")
|
|
78
|
-
)
|
|
79
|
-
run_config: RunConfig | None = kwargs.get("run_config")
|
|
80
|
-
if run_config is None:
|
|
81
|
-
run_config = RunConfig()
|
|
82
|
-
kwargs["run_config"] = run_config
|
|
83
|
-
|
|
84
|
-
if run_config.model_provider is not None:
|
|
85
|
-
run_config.model_provider = _ObservedProvider(
|
|
86
|
-
run_config.model_provider,
|
|
87
|
-
metrics=getattr(starting_agent, "metrics", None) or metrics,
|
|
88
|
-
metric_collection=getattr(
|
|
89
|
-
starting_agent, "metric_collection", None
|
|
90
|
-
)
|
|
91
|
-
or metric_collection,
|
|
92
|
-
deepeval_prompt=getattr(
|
|
93
|
-
starting_agent, "deepeval_prompt", None
|
|
94
|
-
),
|
|
95
|
-
)
|
|
96
|
-
|
|
97
|
-
input_val = args[1] if len(args) >= 2 else kwargs.get("input", None)
|
|
98
175
|
with Observer(
|
|
99
176
|
span_type="custom",
|
|
100
177
|
metric_collection=metric_collection,
|
|
101
178
|
metrics=metrics,
|
|
102
179
|
func_name="run_sync",
|
|
103
|
-
function_kwargs={"input":
|
|
180
|
+
function_kwargs={"input": input}, # also set below
|
|
104
181
|
) as observer:
|
|
182
|
+
update_trace_attributes(
|
|
183
|
+
name=name,
|
|
184
|
+
tags=tags,
|
|
185
|
+
metadata=metadata,
|
|
186
|
+
thread_id=thread_id,
|
|
187
|
+
user_id=user_id,
|
|
188
|
+
metric_collection=metric_collection,
|
|
189
|
+
metrics=metrics,
|
|
190
|
+
)
|
|
191
|
+
|
|
105
192
|
current_span = current_span_context.get()
|
|
106
193
|
current_trace = current_trace_context.get()
|
|
107
|
-
current_trace.input
|
|
194
|
+
if not current_trace.input:
|
|
195
|
+
current_trace.input = input
|
|
108
196
|
if current_span:
|
|
109
|
-
current_span.input =
|
|
110
|
-
res = super().run_sync(
|
|
111
|
-
|
|
112
|
-
|
|
197
|
+
current_span.input = input
|
|
198
|
+
res = super().run_sync(
|
|
199
|
+
starting_agent,
|
|
200
|
+
input,
|
|
201
|
+
context=context,
|
|
202
|
+
max_turns=max_turns,
|
|
203
|
+
hooks=hooks,
|
|
204
|
+
run_config=run_config,
|
|
205
|
+
previous_response_id=previous_response_id,
|
|
206
|
+
conversation_id=conversation_id,
|
|
207
|
+
session=session,
|
|
208
|
+
**kwargs, # backwards compatibility
|
|
209
|
+
)
|
|
210
|
+
current_trace_thread_id = current_trace_context.get().thread_id
|
|
211
|
+
_output = None
|
|
212
|
+
if current_trace_thread_id:
|
|
213
|
+
_output = res.final_output
|
|
214
|
+
else:
|
|
215
|
+
_output = str(res)
|
|
216
|
+
update_trace_attributes(output=_output)
|
|
217
|
+
observer.result = _output
|
|
113
218
|
|
|
114
219
|
return res
|
|
220
|
+
|
|
221
|
+
@classmethod
|
|
222
|
+
def run_streamed(
|
|
223
|
+
cls,
|
|
224
|
+
starting_agent: Agent[TContext],
|
|
225
|
+
input: Union[str, list[TResponseInputItem]],
|
|
226
|
+
*,
|
|
227
|
+
context: Optional[TContext] = None,
|
|
228
|
+
max_turns: int = DEFAULT_MAX_TURNS,
|
|
229
|
+
hooks: Optional[RunHooks[TContext]] = None,
|
|
230
|
+
run_config: Optional[RunConfig] = None,
|
|
231
|
+
previous_response_id: Optional[str] = None,
|
|
232
|
+
conversation_id: Optional[str] = None,
|
|
233
|
+
session: Optional[Session] = None,
|
|
234
|
+
metrics: Optional[List[BaseMetric]] = None,
|
|
235
|
+
metric_collection: Optional[str] = None,
|
|
236
|
+
name: Optional[str] = None,
|
|
237
|
+
tags: Optional[List[str]] = None,
|
|
238
|
+
metadata: Optional[dict] = None,
|
|
239
|
+
thread_id: Optional[str] = None,
|
|
240
|
+
user_id: Optional[str] = None,
|
|
241
|
+
**kwargs, # backwards compatibility
|
|
242
|
+
) -> RunResultStreaming:
|
|
243
|
+
is_agents_available()
|
|
244
|
+
# Manually enter observer; we'll exit when streaming finishes
|
|
245
|
+
observer = Observer(
|
|
246
|
+
span_type="custom",
|
|
247
|
+
metric_collection=metric_collection,
|
|
248
|
+
metrics=metrics,
|
|
249
|
+
func_name="run_streamed",
|
|
250
|
+
function_kwargs={"input": input},
|
|
251
|
+
)
|
|
252
|
+
observer.__enter__()
|
|
253
|
+
|
|
254
|
+
update_trace_attributes(
|
|
255
|
+
name=name,
|
|
256
|
+
tags=tags,
|
|
257
|
+
metadata=metadata,
|
|
258
|
+
thread_id=thread_id,
|
|
259
|
+
user_id=user_id,
|
|
260
|
+
metric_collection=metric_collection,
|
|
261
|
+
metrics=metrics,
|
|
262
|
+
)
|
|
263
|
+
current_trace = current_trace_context.get()
|
|
264
|
+
if not current_trace.input:
|
|
265
|
+
current_trace.input = input
|
|
266
|
+
|
|
267
|
+
current_span = current_span_context.get()
|
|
268
|
+
if current_span:
|
|
269
|
+
current_span.input = input
|
|
270
|
+
|
|
271
|
+
res = super().run_streamed(
|
|
272
|
+
starting_agent,
|
|
273
|
+
input,
|
|
274
|
+
context=context,
|
|
275
|
+
max_turns=max_turns,
|
|
276
|
+
hooks=hooks,
|
|
277
|
+
run_config=run_config,
|
|
278
|
+
previous_response_id=previous_response_id,
|
|
279
|
+
conversation_id=conversation_id,
|
|
280
|
+
session=session,
|
|
281
|
+
**kwargs, # backwards compatibility
|
|
282
|
+
)
|
|
283
|
+
|
|
284
|
+
# Runtime-patch stream_events so the observer closes only after streaming completes
|
|
285
|
+
orig_stream_events = res.stream_events
|
|
286
|
+
|
|
287
|
+
async def _patched_stream_events(self: RunResultStreaming):
|
|
288
|
+
try:
|
|
289
|
+
async for event in orig_stream_events():
|
|
290
|
+
yield event
|
|
291
|
+
observer.result = self.final_output
|
|
292
|
+
update_trace_attributes(output=self.final_output)
|
|
293
|
+
except Exception as e:
|
|
294
|
+
observer.__exit__(type(e), e, e.__traceback__)
|
|
295
|
+
raise
|
|
296
|
+
finally:
|
|
297
|
+
observer.__exit__(None, None, None)
|
|
298
|
+
|
|
299
|
+
from types import MethodType as _MethodType
|
|
300
|
+
|
|
301
|
+
res.stream_events = _MethodType(_patched_stream_events, res)
|
|
302
|
+
|
|
303
|
+
return res
|
|
304
|
+
|
|
305
|
+
|
|
306
|
+
def update_trace_attributes(
|
|
307
|
+
input: Any = None,
|
|
308
|
+
output: Any = None,
|
|
309
|
+
name: str = None,
|
|
310
|
+
tags: List[str] = None,
|
|
311
|
+
metadata: dict = None,
|
|
312
|
+
thread_id: str = None,
|
|
313
|
+
user_id: str = None,
|
|
314
|
+
metric_collection: str = None,
|
|
315
|
+
metrics: List[BaseMetric] = None,
|
|
316
|
+
):
|
|
317
|
+
current_trace = current_trace_context.get()
|
|
318
|
+
if input:
|
|
319
|
+
current_trace.input = input
|
|
320
|
+
if output:
|
|
321
|
+
current_trace.output = output
|
|
322
|
+
if name:
|
|
323
|
+
current_trace.name = name
|
|
324
|
+
if tags:
|
|
325
|
+
current_trace.tags = tags
|
|
326
|
+
if metadata:
|
|
327
|
+
current_trace.metadata = metadata
|
|
328
|
+
if thread_id:
|
|
329
|
+
current_trace.thread_id = thread_id
|
|
330
|
+
if user_id:
|
|
331
|
+
current_trace.user_id = user_id
|
|
332
|
+
if metric_collection:
|
|
333
|
+
current_trace.metric_collection = metric_collection
|
|
334
|
+
if metrics:
|
|
335
|
+
current_trace.metrics = metrics
|
deepeval/scorer/scorer.py
CHANGED
|
@@ -223,7 +223,7 @@ class Scorer:
|
|
|
223
223
|
Right now we are using score_one method under the hood. Instead of scoring multiple predictions for faithfulness.
|
|
224
224
|
"""
|
|
225
225
|
try:
|
|
226
|
-
from deepeval.models import SummaCModels
|
|
226
|
+
from deepeval.models.summac_model import SummaCModels
|
|
227
227
|
except Exception as e:
|
|
228
228
|
print(f"SummaCZS model can not be loaded.\n{e}")
|
|
229
229
|
|
|
@@ -326,7 +326,7 @@ class Scorer:
|
|
|
326
326
|
from sentence_transformers import util
|
|
327
327
|
|
|
328
328
|
try:
|
|
329
|
-
from deepeval.models import (
|
|
329
|
+
from deepeval.models.answer_relevancy_model import (
|
|
330
330
|
AnswerRelevancyModel,
|
|
331
331
|
CrossEncoderAnswerRelevancyModel,
|
|
332
332
|
)
|
deepeval/tracing/tracing.py
CHANGED
|
@@ -8,9 +8,6 @@ import random
|
|
|
8
8
|
import atexit
|
|
9
9
|
import queue
|
|
10
10
|
import uuid
|
|
11
|
-
import os
|
|
12
|
-
import json
|
|
13
|
-
import time
|
|
14
11
|
from openai import OpenAI
|
|
15
12
|
from rich.console import Console
|
|
16
13
|
from rich.progress import Progress
|
|
@@ -496,6 +493,7 @@ class TraceManager:
|
|
|
496
493
|
asyncio.gather(*pending, return_exceptions=True)
|
|
497
494
|
)
|
|
498
495
|
self.flush_traces(remaining_trace_request_bodies)
|
|
496
|
+
loop.run_until_complete(loop.shutdown_asyncgens())
|
|
499
497
|
loop.close()
|
|
500
498
|
|
|
501
499
|
def flush_traces(
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: deepeval
|
|
3
|
-
Version: 3.5.
|
|
3
|
+
Version: 3.5.6
|
|
4
4
|
Summary: The LLM Evaluation Framework
|
|
5
5
|
Home-page: https://github.com/confident-ai/deepeval
|
|
6
6
|
License: Apache-2.0
|
|
@@ -186,6 +186,8 @@ Let's pretend your LLM application is a RAG based customer support chatbot; here
|
|
|
186
186
|
|
|
187
187
|
## Installation
|
|
188
188
|
|
|
189
|
+
DeepEval works with **Python 3.9 or later**.
|
|
190
|
+
|
|
189
191
|
```
|
|
190
192
|
pip install -U deepeval
|
|
191
193
|
```
|