deepeval 3.5.1__py3-none-any.whl → 3.5.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/config/settings.py +94 -2
- deepeval/config/utils.py +54 -1
- deepeval/constants.py +27 -0
- deepeval/integrations/langchain/__init__.py +2 -3
- deepeval/integrations/langchain/callback.py +126 -301
- deepeval/integrations/langchain/patch.py +24 -13
- deepeval/integrations/langchain/utils.py +203 -1
- deepeval/integrations/pydantic_ai/patcher.py +220 -185
- deepeval/integrations/pydantic_ai/utils.py +86 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +1 -0
- deepeval/metrics/pii_leakage/pii_leakage.py +1 -1
- deepeval/models/embedding_models/azure_embedding_model.py +40 -9
- deepeval/models/embedding_models/local_embedding_model.py +54 -11
- deepeval/models/embedding_models/ollama_embedding_model.py +25 -7
- deepeval/models/embedding_models/openai_embedding_model.py +47 -5
- deepeval/models/llms/amazon_bedrock_model.py +31 -4
- deepeval/models/llms/anthropic_model.py +39 -13
- deepeval/models/llms/azure_model.py +37 -38
- deepeval/models/llms/deepseek_model.py +36 -7
- deepeval/models/llms/gemini_model.py +10 -0
- deepeval/models/llms/grok_model.py +50 -3
- deepeval/models/llms/kimi_model.py +37 -7
- deepeval/models/llms/local_model.py +38 -12
- deepeval/models/llms/ollama_model.py +15 -3
- deepeval/models/llms/openai_model.py +37 -44
- deepeval/models/mlllms/gemini_model.py +21 -3
- deepeval/models/mlllms/ollama_model.py +38 -13
- deepeval/models/mlllms/openai_model.py +18 -42
- deepeval/models/retry_policy.py +548 -64
- deepeval/prompt/api.py +13 -9
- deepeval/prompt/prompt.py +19 -9
- deepeval/tracing/tracing.py +87 -0
- deepeval/utils.py +12 -0
- {deepeval-3.5.1.dist-info → deepeval-3.5.3.dist-info}/METADATA +1 -1
- {deepeval-3.5.1.dist-info → deepeval-3.5.3.dist-info}/RECORD +39 -38
- {deepeval-3.5.1.dist-info → deepeval-3.5.3.dist-info}/LICENSE.md +0 -0
- {deepeval-3.5.1.dist-info → deepeval-3.5.3.dist-info}/WHEEL +0 -0
- {deepeval-3.5.1.dist-info → deepeval-3.5.3.dist-info}/entry_points.txt +0 -0
|
@@ -12,6 +12,8 @@ from deepeval.confident.api import get_confident_api_key
|
|
|
12
12
|
from deepeval.integrations.pydantic_ai.otel import instrument_pydantic_ai
|
|
13
13
|
from deepeval.telemetry import capture_tracing_integration
|
|
14
14
|
from deepeval.prompt import Prompt
|
|
15
|
+
import inspect
|
|
16
|
+
from contextvars import ContextVar
|
|
15
17
|
|
|
16
18
|
try:
|
|
17
19
|
from pydantic_ai.agent import Agent
|
|
@@ -26,11 +28,68 @@ try:
|
|
|
26
28
|
ToolReturnPart,
|
|
27
29
|
UserPromptPart,
|
|
28
30
|
)
|
|
31
|
+
from pydantic_ai._run_context import RunContext
|
|
32
|
+
from deepeval.integrations.pydantic_ai.utils import (
|
|
33
|
+
extract_tools_called_from_llm_response,
|
|
34
|
+
extract_tools_called,
|
|
35
|
+
sanitize_run_context,
|
|
36
|
+
)
|
|
29
37
|
|
|
30
38
|
pydantic_ai_installed = True
|
|
31
39
|
except:
|
|
32
40
|
pydantic_ai_installed = True
|
|
33
41
|
|
|
42
|
+
_IN_RUN_SYNC = ContextVar("deepeval_in_run_sync", default=False)
|
|
43
|
+
_INSTRUMENTED = False
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def instrument(otel: Optional[bool] = False, api_key: Optional[str] = None):
    """Turn on deepeval tracing for pydantic-ai.

    Args:
        otel: When truthy, delegate to ``instrument_pydantic_ai`` instead of
            patching ``Agent`` directly.
        api_key: Optional key; when given it is passed to ``deepeval.login``
            before the effective key is looked up.

    Raises:
        ValueError: If ``get_confident_api_key()`` yields no key.
    """
    global _INSTRUMENTED

    if api_key:
        deepeval.login(api_key)

    # The key actually used is always the one reported by
    # get_confident_api_key(), whether or not a login just happened.
    api_key = get_confident_api_key()
    if not api_key:
        raise ValueError("No api key provided.")

    if otel:
        instrument_pydantic_ai(api_key)
        return

    with capture_tracing_integration("pydantic_ai"):
        # The module-level flag keeps the Agent patches from being stacked
        # when instrument() is called more than once.
        if not _INSTRUMENTED:
            _patch_agent_init()
            _patch_agent_tool_decorator()
            _INSTRUMENTED = True
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
################### Init Patches ###################
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _patch_agent_init():
    """Replace ``Agent.__init__`` with a wrapper that instruments each new agent.

    The wrapper accepts five extra keyword-only options (LLM metric
    collection / metrics / prompt and agent metric collection / metrics),
    strips them before calling the original ``__init__``, and then patches
    the freshly built instance's model, ``run`` and ``run_sync``.
    """
    original_init = Agent.__init__

    @functools.wraps(original_init)
    def wrapper(
        *args,
        llm_metric_collection: Optional[str] = None,
        llm_metrics: Optional[List[BaseMetric]] = None,
        llm_prompt: Optional[Prompt] = None,
        agent_metric_collection: Optional[str] = None,
        agent_metrics: Optional[List[BaseMetric]] = None,
        **kwargs
    ):
        init_result = original_init(*args, **kwargs)

        # The instance being constructed is the first positional argument.
        agent = args[0]

        # Runtime patch of the model attached to this instance.
        _patch_llm_model(
            agent._model, llm_metric_collection, llm_metrics, llm_prompt
        )
        _patch_agent_run(agent, agent_metric_collection, agent_metrics)
        _patch_agent_run_sync(agent, agent_metric_collection, agent_metrics)
        return init_result

    Agent.__init__ = wrapper
|
|
92
|
+
|
|
34
93
|
|
|
35
94
|
def _patch_agent_tool_decorator():
|
|
36
95
|
original_tool = Agent.tool
|
|
@@ -64,101 +123,101 @@ def _patch_agent_tool_decorator():
|
|
|
64
123
|
Agent.tool = wrapper
|
|
65
124
|
|
|
66
125
|
|
|
67
|
-
|
|
68
|
-
func: Callable,
|
|
69
|
-
metrics: Optional[List[BaseMetric]] = None,
|
|
70
|
-
metric_collection: Optional[str] = None,
|
|
71
|
-
):
|
|
72
|
-
import asyncio
|
|
73
|
-
|
|
74
|
-
original_func = func
|
|
75
|
-
|
|
76
|
-
is_async = asyncio.iscoroutinefunction(original_func)
|
|
126
|
+
################### Runtime Patches ###################
|
|
77
127
|
|
|
78
|
-
if is_async:
|
|
79
128
|
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
metric_collection=metric_collection,
|
|
87
|
-
function_kwargs={"args": args, **kwargs},
|
|
88
|
-
) as observer:
|
|
89
|
-
result = await original_func(*args, **kwargs)
|
|
90
|
-
observer.result = result
|
|
91
|
-
|
|
92
|
-
return result
|
|
129
|
+
def _patch_agent_run_sync(
|
|
130
|
+
agent: Agent,
|
|
131
|
+
agent_metric_collection: Optional[str] = None,
|
|
132
|
+
agent_metrics: Optional[List[BaseMetric]] = None,
|
|
133
|
+
):
|
|
134
|
+
original_run_sync = agent.run_sync
|
|
93
135
|
|
|
94
|
-
|
|
95
|
-
|
|
136
|
+
@functools.wraps(original_run_sync)
|
|
137
|
+
def wrapper(
|
|
138
|
+
*args,
|
|
139
|
+
metric_collection: Optional[str] = None,
|
|
140
|
+
metrics: Optional[List[BaseMetric]] = None,
|
|
141
|
+
name: Optional[str] = None,
|
|
142
|
+
tags: Optional[List[str]] = None,
|
|
143
|
+
metadata: Optional[dict] = None,
|
|
144
|
+
thread_id: Optional[str] = None,
|
|
145
|
+
user_id: Optional[str] = None,
|
|
146
|
+
**kwargs
|
|
147
|
+
):
|
|
96
148
|
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
func_name=original_func.__name__,
|
|
102
|
-
metrics=metrics,
|
|
103
|
-
metric_collection=metric_collection,
|
|
104
|
-
function_kwargs={"args": args, **kwargs},
|
|
105
|
-
) as observer:
|
|
106
|
-
result = original_func(*args, **kwargs)
|
|
107
|
-
observer.result = result
|
|
149
|
+
sig = inspect.signature(original_run_sync)
|
|
150
|
+
bound = sig.bind_partial(*args, **kwargs)
|
|
151
|
+
bound.apply_defaults()
|
|
152
|
+
input = bound.arguments.get("user_prompt", None)
|
|
108
153
|
|
|
109
|
-
|
|
154
|
+
with Observer(
|
|
155
|
+
span_type="agent",
|
|
156
|
+
func_name="Agent",
|
|
157
|
+
function_kwargs={"input": input},
|
|
158
|
+
metrics=agent_metrics,
|
|
159
|
+
metric_collection=agent_metric_collection,
|
|
160
|
+
) as observer:
|
|
110
161
|
|
|
111
|
-
|
|
162
|
+
token = _IN_RUN_SYNC.set(True)
|
|
163
|
+
try:
|
|
164
|
+
result = original_run_sync(*args, **kwargs)
|
|
165
|
+
finally:
|
|
166
|
+
_IN_RUN_SYNC.reset(token)
|
|
112
167
|
|
|
168
|
+
observer.update_span_properties = (
|
|
169
|
+
lambda agent_span: set_agent_span_attributes(agent_span, result)
|
|
170
|
+
)
|
|
171
|
+
observer.result = result.output
|
|
113
172
|
|
|
114
|
-
|
|
115
|
-
|
|
173
|
+
_update_trace_context(
|
|
174
|
+
trace_name=name,
|
|
175
|
+
trace_tags=tags,
|
|
176
|
+
trace_metadata=metadata,
|
|
177
|
+
trace_thread_id=thread_id,
|
|
178
|
+
trace_user_id=user_id,
|
|
179
|
+
trace_metric_collection=metric_collection,
|
|
180
|
+
trace_metrics=metrics,
|
|
181
|
+
trace_input=input,
|
|
182
|
+
trace_output=result.output,
|
|
183
|
+
)
|
|
116
184
|
|
|
117
|
-
@functools.wraps(original_init)
|
|
118
|
-
def wrapper(
|
|
119
|
-
self,
|
|
120
|
-
*args,
|
|
121
|
-
llm_metric_collection: Optional[str] = None,
|
|
122
|
-
llm_metrics: Optional[List[BaseMetric]] = None,
|
|
123
|
-
llm_prompt: Optional[Prompt] = None,
|
|
124
|
-
agent_metric_collection: Optional[str] = None,
|
|
125
|
-
agent_metrics: Optional[List[BaseMetric]] = None,
|
|
126
|
-
**kwargs
|
|
127
|
-
):
|
|
128
|
-
result = original_init(self, *args, **kwargs)
|
|
129
|
-
_patch_llm_model(
|
|
130
|
-
self._model, llm_metric_collection, llm_metrics, llm_prompt
|
|
131
|
-
) # runtime patch of the model
|
|
132
|
-
_patch_agent_run(agent_metric_collection, agent_metrics)
|
|
133
185
|
return result
|
|
134
186
|
|
|
135
|
-
|
|
187
|
+
agent.run_sync = wrapper
|
|
136
188
|
|
|
137
189
|
|
|
138
190
|
def _patch_agent_run(
|
|
191
|
+
agent: Agent,
|
|
139
192
|
agent_metric_collection: Optional[str] = None,
|
|
140
193
|
agent_metrics: Optional[List[BaseMetric]] = None,
|
|
141
194
|
):
|
|
142
|
-
original_run =
|
|
195
|
+
original_run = agent.run
|
|
143
196
|
|
|
144
197
|
@functools.wraps(original_run)
|
|
145
198
|
async def wrapper(
|
|
146
199
|
*args,
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
200
|
+
metric_collection: Optional[str] = None,
|
|
201
|
+
metrics: Optional[List[BaseMetric]] = None,
|
|
202
|
+
name: Optional[str] = None,
|
|
203
|
+
tags: Optional[List[str]] = None,
|
|
204
|
+
metadata: Optional[dict] = None,
|
|
205
|
+
thread_id: Optional[str] = None,
|
|
206
|
+
user_id: Optional[str] = None,
|
|
154
207
|
**kwargs
|
|
155
208
|
):
|
|
209
|
+
sig = inspect.signature(original_run)
|
|
210
|
+
bound = sig.bind_partial(*args, **kwargs)
|
|
211
|
+
bound.apply_defaults()
|
|
212
|
+
input = bound.arguments.get("user_prompt", None)
|
|
213
|
+
|
|
214
|
+
in_sync = _IN_RUN_SYNC.get()
|
|
156
215
|
with Observer(
|
|
157
|
-
span_type="agent",
|
|
158
|
-
func_name="Agent",
|
|
159
|
-
function_kwargs={"input":
|
|
160
|
-
metrics=agent_metrics,
|
|
161
|
-
metric_collection=agent_metric_collection,
|
|
216
|
+
span_type="agent" if not in_sync else "custom",
|
|
217
|
+
func_name="Agent" if not in_sync else "run",
|
|
218
|
+
function_kwargs={"input": input},
|
|
219
|
+
metrics=agent_metrics if not in_sync else None,
|
|
220
|
+
metric_collection=agent_metric_collection if not in_sync else None,
|
|
162
221
|
) as observer:
|
|
163
222
|
result = await original_run(*args, **kwargs)
|
|
164
223
|
observer.update_span_properties = (
|
|
@@ -167,44 +226,20 @@ def _patch_agent_run(
|
|
|
167
226
|
observer.result = result.output
|
|
168
227
|
|
|
169
228
|
_update_trace_context(
|
|
170
|
-
trace_name=
|
|
171
|
-
trace_tags=
|
|
172
|
-
trace_metadata=
|
|
173
|
-
trace_thread_id=
|
|
174
|
-
trace_user_id=
|
|
175
|
-
trace_metric_collection=
|
|
176
|
-
trace_metrics=
|
|
177
|
-
trace_input=
|
|
229
|
+
trace_name=name,
|
|
230
|
+
trace_tags=tags,
|
|
231
|
+
trace_metadata=metadata,
|
|
232
|
+
trace_thread_id=thread_id,
|
|
233
|
+
trace_user_id=user_id,
|
|
234
|
+
trace_metric_collection=metric_collection,
|
|
235
|
+
trace_metrics=metrics,
|
|
236
|
+
trace_input=input,
|
|
178
237
|
trace_output=result.output,
|
|
179
238
|
)
|
|
180
239
|
|
|
181
240
|
return result
|
|
182
241
|
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
def _update_trace_context(
|
|
187
|
-
trace_name: Optional[str] = None,
|
|
188
|
-
trace_tags: Optional[List[str]] = None,
|
|
189
|
-
trace_metadata: Optional[dict] = None,
|
|
190
|
-
trace_thread_id: Optional[str] = None,
|
|
191
|
-
trace_user_id: Optional[str] = None,
|
|
192
|
-
trace_metric_collection: Optional[str] = None,
|
|
193
|
-
trace_metrics: Optional[List[BaseMetric]] = None,
|
|
194
|
-
trace_input: Optional[Any] = None,
|
|
195
|
-
trace_output: Optional[Any] = None,
|
|
196
|
-
):
|
|
197
|
-
|
|
198
|
-
current_trace = current_trace_context.get()
|
|
199
|
-
current_trace.name = trace_name
|
|
200
|
-
current_trace.tags = trace_tags
|
|
201
|
-
current_trace.metadata = trace_metadata
|
|
202
|
-
current_trace.thread_id = trace_thread_id
|
|
203
|
-
current_trace.user_id = trace_user_id
|
|
204
|
-
current_trace.metric_collection = trace_metric_collection
|
|
205
|
-
current_trace.metrics = trace_metrics
|
|
206
|
-
current_trace.input = trace_input
|
|
207
|
-
current_trace.output = trace_output
|
|
242
|
+
agent.run = wrapper
|
|
208
243
|
|
|
209
244
|
|
|
210
245
|
def _patch_llm_model(
|
|
@@ -214,6 +249,8 @@ def _patch_llm_model(
|
|
|
214
249
|
llm_prompt: Optional[Prompt] = None,
|
|
215
250
|
):
|
|
216
251
|
original_func = model.request
|
|
252
|
+
sig = inspect.signature(original_func)
|
|
253
|
+
|
|
217
254
|
try:
|
|
218
255
|
model_name = model.model_name
|
|
219
256
|
except Exception:
|
|
@@ -221,6 +258,10 @@ def _patch_llm_model(
|
|
|
221
258
|
|
|
222
259
|
@functools.wraps(original_func)
|
|
223
260
|
async def wrapper(*args, **kwargs):
|
|
261
|
+
bound = sig.bind_partial(*args, **kwargs)
|
|
262
|
+
bound.apply_defaults()
|
|
263
|
+
request = bound.arguments.get("messages", [])
|
|
264
|
+
|
|
224
265
|
with Observer(
|
|
225
266
|
span_type="llm",
|
|
226
267
|
func_name="LLM",
|
|
@@ -229,36 +270,93 @@ def _patch_llm_model(
|
|
|
229
270
|
metric_collection=llm_metric_collection,
|
|
230
271
|
) as observer:
|
|
231
272
|
result = await original_func(*args, **kwargs)
|
|
232
|
-
request = kwargs.get("messages", [])
|
|
233
|
-
if not request:
|
|
234
|
-
request = args[0]
|
|
235
273
|
observer.update_span_properties = (
|
|
236
274
|
lambda llm_span: set_llm_span_attributes(
|
|
237
|
-
llm_span,
|
|
275
|
+
llm_span, request, result, llm_prompt
|
|
238
276
|
)
|
|
239
277
|
)
|
|
240
278
|
observer.result = result
|
|
241
|
-
|
|
279
|
+
return result
|
|
242
280
|
|
|
243
281
|
model.request = wrapper
|
|
244
282
|
|
|
245
283
|
|
|
246
|
-
|
|
284
|
+
################### Helper Functions ###################
|
|
247
285
|
|
|
248
|
-
if api_key:
|
|
249
|
-
deepeval.login(api_key)
|
|
250
286
|
|
|
251
|
-
|
|
287
|
+
def _create_patched_tool(
|
|
288
|
+
func: Callable,
|
|
289
|
+
metrics: Optional[List[BaseMetric]] = None,
|
|
290
|
+
metric_collection: Optional[str] = None,
|
|
291
|
+
):
|
|
292
|
+
import asyncio
|
|
252
293
|
|
|
253
|
-
|
|
254
|
-
raise ValueError("No api key provided.")
|
|
294
|
+
original_func = func
|
|
255
295
|
|
|
256
|
-
|
|
257
|
-
|
|
296
|
+
is_async = asyncio.iscoroutinefunction(original_func)
|
|
297
|
+
|
|
298
|
+
if is_async:
|
|
299
|
+
|
|
300
|
+
@functools.wraps(original_func)
|
|
301
|
+
async def async_wrapper(*args, **kwargs):
|
|
302
|
+
sanitized_args = sanitize_run_context(args)
|
|
303
|
+
sanitized_kwargs = sanitize_run_context(kwargs)
|
|
304
|
+
with Observer(
|
|
305
|
+
span_type="tool",
|
|
306
|
+
func_name=original_func.__name__,
|
|
307
|
+
metrics=metrics,
|
|
308
|
+
metric_collection=metric_collection,
|
|
309
|
+
function_kwargs={"args": sanitized_args, **sanitized_kwargs},
|
|
310
|
+
) as observer:
|
|
311
|
+
result = await original_func(*args, **kwargs)
|
|
312
|
+
observer.result = result
|
|
313
|
+
|
|
314
|
+
return result
|
|
315
|
+
|
|
316
|
+
return async_wrapper
|
|
258
317
|
else:
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
318
|
+
|
|
319
|
+
@functools.wraps(original_func)
|
|
320
|
+
def sync_wrapper(*args, **kwargs):
|
|
321
|
+
sanitized_args = sanitize_run_context(args)
|
|
322
|
+
sanitized_kwargs = sanitize_run_context(kwargs)
|
|
323
|
+
with Observer(
|
|
324
|
+
span_type="tool",
|
|
325
|
+
func_name=original_func.__name__,
|
|
326
|
+
metrics=metrics,
|
|
327
|
+
metric_collection=metric_collection,
|
|
328
|
+
function_kwargs={"args": sanitized_args, **sanitized_kwargs},
|
|
329
|
+
) as observer:
|
|
330
|
+
result = original_func(*args, **kwargs)
|
|
331
|
+
observer.result = result
|
|
332
|
+
|
|
333
|
+
return result
|
|
334
|
+
|
|
335
|
+
return sync_wrapper
|
|
336
|
+
|
|
337
|
+
|
|
338
|
+
def _update_trace_context(
    trace_name: Optional[str] = None,
    trace_tags: Optional[List[str]] = None,
    trace_metadata: Optional[dict] = None,
    trace_thread_id: Optional[str] = None,
    trace_user_id: Optional[str] = None,
    trace_metric_collection: Optional[str] = None,
    trace_metrics: Optional[List[BaseMetric]] = None,
    trace_input: Optional[Any] = None,
    trace_output: Optional[Any] = None,
):
    """Copy the given values onto the trace held in ``current_trace_context``.

    Every attribute is assigned unconditionally, so an argument left at its
    ``None`` default overwrites whatever the trace held for that attribute.
    """
    active_trace = current_trace_context.get()

    # (attribute on the trace, value to store) in assignment order.
    assignments = (
        ("name", trace_name),
        ("tags", trace_tags),
        ("metadata", trace_metadata),
        ("thread_id", trace_thread_id),
        ("user_id", trace_user_id),
        ("metric_collection", trace_metric_collection),
        ("metrics", trace_metrics),
        ("input", trace_input),
        ("output", trace_output),
    )
    for attribute, new_value in assignments:
        setattr(active_trace, attribute, new_value)
|
|
262
360
|
|
|
263
361
|
|
|
264
362
|
def set_llm_span_attributes(
|
|
@@ -306,71 +404,8 @@ def set_llm_span_attributes(
|
|
|
306
404
|
llm_span.output = LlmOutput(
|
|
307
405
|
role="Assistant", content=content, tool_calls=tool_calls
|
|
308
406
|
)
|
|
309
|
-
llm_span.tools_called =
|
|
310
|
-
result.parts
|
|
311
|
-
)
|
|
407
|
+
llm_span.tools_called = extract_tools_called_from_llm_response(result.parts)
|
|
312
408
|
|
|
313
409
|
|
|
314
410
|
def set_agent_span_attributes(agent_span: AgentSpan, result: AgentRunResult):
|
|
315
|
-
agent_span.tools_called =
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
# llm tools called
|
|
319
|
-
def _extract_tools_called_from_llm_response(
|
|
320
|
-
result: List[ModelResponsePart],
|
|
321
|
-
) -> List[ToolCall]:
|
|
322
|
-
tool_calls = []
|
|
323
|
-
|
|
324
|
-
# Loop through each ModelResponsePart
|
|
325
|
-
for part in result:
|
|
326
|
-
# Look for parts with part_kind="tool-call"
|
|
327
|
-
if hasattr(part, "part_kind") and part.part_kind == "tool-call":
|
|
328
|
-
# Extract tool name and args from the ToolCallPart
|
|
329
|
-
tool_name = part.tool_name
|
|
330
|
-
input_parameters = (
|
|
331
|
-
part.args_as_dict() if hasattr(part, "args_as_dict") else None
|
|
332
|
-
)
|
|
333
|
-
|
|
334
|
-
# Create and append ToolCall object
|
|
335
|
-
tool_call = ToolCall(
|
|
336
|
-
name=tool_name, input_parameters=input_parameters
|
|
337
|
-
)
|
|
338
|
-
tool_calls.append(tool_call)
|
|
339
|
-
|
|
340
|
-
return tool_calls
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
# TODO: llm tools called (reposne is present next message)
|
|
344
|
-
def _extract_tools_called(result: AgentRunResult) -> List[ToolCall]:
|
|
345
|
-
tool_calls = []
|
|
346
|
-
|
|
347
|
-
# Access the message history from the _state
|
|
348
|
-
message_history = result._state.message_history
|
|
349
|
-
|
|
350
|
-
# Scan through all messages in the history
|
|
351
|
-
for message in message_history:
|
|
352
|
-
# Check if this is a ModelResponse (kind="response")
|
|
353
|
-
if hasattr(message, "kind") and message.kind == "response":
|
|
354
|
-
# For ModelResponse messages, check each part
|
|
355
|
-
if hasattr(message, "parts"):
|
|
356
|
-
for part in message.parts:
|
|
357
|
-
# Look for parts with part_kind="tool-call"
|
|
358
|
-
if (
|
|
359
|
-
hasattr(part, "part_kind")
|
|
360
|
-
and part.part_kind == "tool-call"
|
|
361
|
-
):
|
|
362
|
-
# Extract tool name and args from the ToolCallPart
|
|
363
|
-
tool_name = part.tool_name
|
|
364
|
-
input_parameters = (
|
|
365
|
-
part.args_as_dict()
|
|
366
|
-
if hasattr(part, "args_as_dict")
|
|
367
|
-
else None
|
|
368
|
-
)
|
|
369
|
-
|
|
370
|
-
# Create and append ToolCall object
|
|
371
|
-
tool_call = ToolCall(
|
|
372
|
-
name=tool_name, input_parameters=input_parameters
|
|
373
|
-
)
|
|
374
|
-
tool_calls.append(tool_call)
|
|
375
|
-
|
|
376
|
-
return tool_calls
|
|
411
|
+
agent_span.tools_called = extract_tools_called(result)
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
from typing import List
|
|
2
|
+
from pydantic_ai.messages import ModelResponsePart
|
|
3
|
+
from pydantic_ai.agent import AgentRunResult
|
|
4
|
+
from pydantic_ai._run_context import RunContext
|
|
5
|
+
from deepeval.test_case.llm_test_case import ToolCall
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
# llm tools called
|
|
9
|
+
def extract_tools_called_from_llm_response(
    result: List[ModelResponsePart],
) -> List[ToolCall]:
    """Build a ``ToolCall`` for every tool-call part of a model response.

    A part counts as a tool call when it has a ``part_kind`` attribute equal
    to ``"tool-call"``. Its ``tool_name`` becomes the call name, and its
    ``args_as_dict()`` result becomes the input parameters (``None`` when the
    part has no such method). Order follows the order of ``result``.
    """
    return [
        ToolCall(
            name=part.tool_name,
            input_parameters=(
                part.args_as_dict() if hasattr(part, "args_as_dict") else None
            ),
        )
        for part in result
        if getattr(part, "part_kind", None) == "tool-call"
    ]
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
# TODO: llm tools called (response is present in the next message)
|
|
34
|
+
def extract_tools_called(result: AgentRunResult) -> List[ToolCall]:
    """Collect the tool calls recorded in an agent run's message history.

    Iterates ``result._state.message_history`` and, for each message whose
    ``kind`` is ``"response"`` and that exposes ``parts``, gathers its
    tool-call parts as ``ToolCall`` objects.

    Args:
        result: The finished agent run to inspect.

    Returns:
        The tool calls in history order; empty when none were found.
    """
    tool_calls: List[ToolCall] = []

    for message in result._state.message_history:
        # Only model responses are inspected for tool-call parts.
        if getattr(message, "kind", None) != "response":
            continue
        if not hasattr(message, "parts"):
            continue
        # Per-part extraction is shared with the single-response helper so
        # the two code paths stay in agreement.
        tool_calls.extend(extract_tools_called_from_llm_response(message.parts))

    return tool_calls
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def sanitize_run_context(value):
    """
    Return a copy of *value* with every pydantic-ai RunContext swapped for
    the placeholder string '<RunContext>'.

    Dicts (values only), lists, tuples and sets are walked recursively and
    rebuilt as plain dict / list / tuple / set; anything else is returned
    unchanged. The input itself is never mutated, so the original arguments
    can still be passed to the real function call.
    """
    if isinstance(value, RunContext):
        return "<RunContext>"

    if isinstance(value, dict):
        cleaned = {}
        for key, item in value.items():
            cleaned[key] = sanitize_run_context(item)
        return cleaned

    if isinstance(value, tuple):
        return tuple([sanitize_run_context(item) for item in value])

    if isinstance(value, list):
        return [sanitize_run_context(item) for item in value]

    if isinstance(value, set):
        return set(map(sanitize_run_context, value))

    return value
|
|
@@ -316,6 +316,7 @@ class ConversationalGEval(BaseConversationalMetric):
|
|
|
316
316
|
else:
|
|
317
317
|
prompt = ConversationalGEvalTemplate.generate_evaluation_results(
|
|
318
318
|
evaluation_steps=self.number_evaluation_steps(),
|
|
319
|
+
test_case_content=test_case_content,
|
|
319
320
|
turns=[
|
|
320
321
|
convert_turn_to_dict(turn, self.evaluation_params)
|
|
321
322
|
for turn in test_case.turns
|
|
@@ -284,7 +284,7 @@ class PIILeakageMetric(BaseMetric):
|
|
|
284
284
|
no_privacy_count += 1
|
|
285
285
|
|
|
286
286
|
score = no_privacy_count / number_of_verdicts
|
|
287
|
-
return
|
|
287
|
+
return 0 if self.strict_mode and score < self.threshold else score
|
|
288
288
|
|
|
289
289
|
def is_successful(self) -> bool:
|
|
290
290
|
if self.error is not None:
|