deepeval 3.4.9__py3-none-any.whl → 3.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/benchmarks/drop/drop.py +2 -3
- deepeval/benchmarks/hellaswag/hellaswag.py +2 -2
- deepeval/benchmarks/logi_qa/logi_qa.py +2 -2
- deepeval/benchmarks/math_qa/math_qa.py +2 -2
- deepeval/benchmarks/mmlu/mmlu.py +2 -2
- deepeval/benchmarks/truthful_qa/truthful_qa.py +2 -2
- deepeval/confident/api.py +1 -0
- deepeval/integrations/pydantic_ai/__init__.py +2 -4
- deepeval/integrations/pydantic_ai/{setup.py → otel.py} +0 -8
- deepeval/integrations/pydantic_ai/patcher.py +376 -0
- deepeval/models/llms/grok_model.py +1 -1
- deepeval/models/llms/kimi_model.py +1 -1
- deepeval/tracing/perf_epoch_bridge.py +4 -4
- deepeval/utils.py +2 -2
- {deepeval-3.4.9.dist-info → deepeval-3.5.0.dist-info}/METADATA +14 -13
- {deepeval-3.4.9.dist-info → deepeval-3.5.0.dist-info}/RECORD +20 -20
- deepeval/integrations/pydantic_ai/agent.py +0 -364
- {deepeval-3.4.9.dist-info → deepeval-3.5.0.dist-info}/LICENSE.md +0 -0
- {deepeval-3.4.9.dist-info → deepeval-3.5.0.dist-info}/WHEEL +0 -0
- {deepeval-3.4.9.dist-info → deepeval-3.5.0.dist-info}/entry_points.txt +0 -0
deepeval/_version.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__: str = "3.4.9"
|
|
1
|
+
__version__: str = "3.5.0"
|
deepeval/benchmarks/drop/drop.py
CHANGED
|
@@ -1,6 +1,5 @@
|
|
|
1
|
-
from typing import List, Optional, Dict
|
|
1
|
+
from typing import List, Optional, Dict, Union
|
|
2
2
|
from tqdm import tqdm
|
|
3
|
-
from typing import Union
|
|
4
3
|
|
|
5
4
|
from deepeval.dataset import Golden
|
|
6
5
|
from deepeval.benchmarks.base_benchmark import (
|
|
@@ -50,7 +49,7 @@ class DROP(DeepEvalBaseBenchmark):
|
|
|
50
49
|
self,
|
|
51
50
|
model: DeepEvalBaseLLM,
|
|
52
51
|
*args,
|
|
53
|
-
batch_size: int
|
|
52
|
+
batch_size: Union[int, None] = None,
|
|
54
53
|
**kwargs,
|
|
55
54
|
) -> DeepEvalBaseBenchmarkResult:
|
|
56
55
|
import pandas as pd
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from typing import List, Dict, Optional
|
|
1
|
+
from typing import List, Dict, Optional, Union
|
|
2
2
|
from tqdm import tqdm
|
|
3
3
|
|
|
4
4
|
from deepeval.dataset import Golden
|
|
@@ -51,7 +51,7 @@ class HellaSwag(DeepEvalBaseBenchmark):
|
|
|
51
51
|
self,
|
|
52
52
|
model: DeepEvalBaseLLM,
|
|
53
53
|
*args,
|
|
54
|
-
batch_size: int
|
|
54
|
+
batch_size: Union[int, None] = None,
|
|
55
55
|
**kwargs,
|
|
56
56
|
) -> DeepEvalBaseBenchmarkResult:
|
|
57
57
|
import pandas as pd
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from typing import List, Optional, Dict
|
|
1
|
+
from typing import List, Optional, Dict, Union
|
|
2
2
|
from tqdm import tqdm
|
|
3
3
|
import requests
|
|
4
4
|
import json
|
|
@@ -52,7 +52,7 @@ class LogiQA(DeepEvalBaseBenchmark):
|
|
|
52
52
|
self,
|
|
53
53
|
model: DeepEvalBaseLLM,
|
|
54
54
|
*args,
|
|
55
|
-
batch_size: int
|
|
55
|
+
batch_size: Union[int, None] = None,
|
|
56
56
|
**kwargs,
|
|
57
57
|
) -> DeepEvalBaseBenchmarkResult:
|
|
58
58
|
import pandas as pd
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from typing import List, Optional, Dict
|
|
1
|
+
from typing import List, Optional, Dict, Union
|
|
2
2
|
from tqdm import tqdm
|
|
3
3
|
|
|
4
4
|
from deepeval.dataset import Golden
|
|
@@ -50,7 +50,7 @@ class MathQA(DeepEvalBaseBenchmark):
|
|
|
50
50
|
self,
|
|
51
51
|
model: DeepEvalBaseLLM,
|
|
52
52
|
*args,
|
|
53
|
-
batch_size: int
|
|
53
|
+
batch_size: Union[int, None] = None,
|
|
54
54
|
**kwargs,
|
|
55
55
|
) -> DeepEvalBaseBenchmarkResult:
|
|
56
56
|
import pandas as pd
|
deepeval/benchmarks/mmlu/mmlu.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from typing import List, Optional, Dict
|
|
1
|
+
from typing import List, Optional, Dict, Union
|
|
2
2
|
from tqdm import tqdm
|
|
3
3
|
|
|
4
4
|
from deepeval.dataset import Golden
|
|
@@ -49,7 +49,7 @@ class MMLU(DeepEvalBaseBenchmark):
|
|
|
49
49
|
self,
|
|
50
50
|
model: DeepEvalBaseLLM,
|
|
51
51
|
*args,
|
|
52
|
-
batch_size: int
|
|
52
|
+
batch_size: Union[int, None] = None,
|
|
53
53
|
**kwargs,
|
|
54
54
|
) -> DeepEvalBaseBenchmarkResult:
|
|
55
55
|
import pandas as pd
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from typing import List, Dict, Optional
|
|
1
|
+
from typing import List, Dict, Optional, Union
|
|
2
2
|
from tqdm import tqdm
|
|
3
3
|
|
|
4
4
|
from deepeval.dataset import Golden
|
|
@@ -59,7 +59,7 @@ class TruthfulQA(DeepEvalBaseBenchmark):
|
|
|
59
59
|
self,
|
|
60
60
|
model: DeepEvalBaseLLM,
|
|
61
61
|
*args,
|
|
62
|
-
batch_size: int
|
|
62
|
+
batch_size: Union[int, None] = None,
|
|
63
63
|
**kwargs,
|
|
64
64
|
) -> DeepEvalBaseBenchmarkResult:
|
|
65
65
|
import pandas as pd
|
deepeval/confident/api.py
CHANGED
|
@@ -31,14 +31,6 @@ def instrument_pydantic_ai(api_key: Optional[str] = None):
|
|
|
31
31
|
with capture_tracing_integration("pydantic_ai"):
|
|
32
32
|
is_opentelemetry_available()
|
|
33
33
|
|
|
34
|
-
if api_key:
|
|
35
|
-
deepeval.login(api_key)
|
|
36
|
-
|
|
37
|
-
api_key = get_confident_api_key()
|
|
38
|
-
|
|
39
|
-
if not api_key:
|
|
40
|
-
raise ValueError("No api key provided.")
|
|
41
|
-
|
|
42
34
|
# create a new tracer provider
|
|
43
35
|
tracer_provider = TracerProvider()
|
|
44
36
|
tracer_provider.add_span_processor(
|
|
@@ -0,0 +1,376 @@
|
|
|
1
|
+
import functools
|
|
2
|
+
import deepeval
|
|
3
|
+
from deepeval.tracing.types import LlmOutput, LlmToolCall
|
|
4
|
+
from pydantic_ai.agent import AgentRunResult
|
|
5
|
+
from deepeval.tracing.context import current_trace_context
|
|
6
|
+
from deepeval.tracing.types import AgentSpan, LlmSpan
|
|
7
|
+
from deepeval.tracing.tracing import Observer
|
|
8
|
+
from typing import List, Callable, Optional, Any
|
|
9
|
+
from deepeval.test_case.llm_test_case import ToolCall
|
|
10
|
+
from deepeval.metrics.base_metric import BaseMetric
|
|
11
|
+
from deepeval.confident.api import get_confident_api_key
|
|
12
|
+
from deepeval.integrations.pydantic_ai.otel import instrument_pydantic_ai
|
|
13
|
+
from deepeval.telemetry import capture_tracing_integration
|
|
14
|
+
from deepeval.prompt import Prompt
|
|
15
|
+
|
|
16
|
+
try:
|
|
17
|
+
from pydantic_ai.agent import Agent
|
|
18
|
+
from pydantic_ai.models import Model
|
|
19
|
+
from pydantic_ai.messages import (
|
|
20
|
+
ModelResponse,
|
|
21
|
+
ModelRequest,
|
|
22
|
+
ModelResponsePart,
|
|
23
|
+
TextPart,
|
|
24
|
+
ToolCallPart,
|
|
25
|
+
SystemPromptPart,
|
|
26
|
+
ToolReturnPart,
|
|
27
|
+
UserPromptPart,
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
pydantic_ai_installed = True
|
|
31
|
+
except:
|
|
32
|
+
pydantic_ai_installed = True
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _patch_agent_tool_decorator():
|
|
36
|
+
original_tool = Agent.tool
|
|
37
|
+
|
|
38
|
+
@functools.wraps(original_tool)
|
|
39
|
+
def wrapper(
|
|
40
|
+
*args,
|
|
41
|
+
metrics: Optional[List[BaseMetric]] = None,
|
|
42
|
+
metric_collection: Optional[str] = None,
|
|
43
|
+
**kwargs
|
|
44
|
+
):
|
|
45
|
+
# Case 1: Direct decoration - @agent.tool
|
|
46
|
+
if args and callable(args[0]):
|
|
47
|
+
patched_func = _create_patched_tool(
|
|
48
|
+
args[0], metrics, metric_collection
|
|
49
|
+
)
|
|
50
|
+
new_args = (patched_func,) + args[1:]
|
|
51
|
+
return original_tool(*new_args, **kwargs)
|
|
52
|
+
|
|
53
|
+
# Case 2: Decoration with arguments - @agent.tool(metrics=..., metric_collection=...)
|
|
54
|
+
else:
|
|
55
|
+
# Return a decorator function that will receive the actual function
|
|
56
|
+
def decorator(func):
|
|
57
|
+
patched_func = _create_patched_tool(
|
|
58
|
+
func, metrics, metric_collection
|
|
59
|
+
)
|
|
60
|
+
return original_tool(*args, **kwargs)(patched_func)
|
|
61
|
+
|
|
62
|
+
return decorator
|
|
63
|
+
|
|
64
|
+
Agent.tool = wrapper
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def _create_patched_tool(
|
|
68
|
+
func: Callable,
|
|
69
|
+
metrics: Optional[List[BaseMetric]] = None,
|
|
70
|
+
metric_collection: Optional[str] = None,
|
|
71
|
+
):
|
|
72
|
+
import asyncio
|
|
73
|
+
|
|
74
|
+
original_func = func
|
|
75
|
+
|
|
76
|
+
is_async = asyncio.iscoroutinefunction(original_func)
|
|
77
|
+
|
|
78
|
+
if is_async:
|
|
79
|
+
|
|
80
|
+
@functools.wraps(original_func)
|
|
81
|
+
async def async_wrapper(*args, **kwargs):
|
|
82
|
+
with Observer(
|
|
83
|
+
span_type="tool",
|
|
84
|
+
func_name=original_func.__name__,
|
|
85
|
+
metrics=metrics,
|
|
86
|
+
metric_collection=metric_collection,
|
|
87
|
+
function_kwargs={"args": args, **kwargs},
|
|
88
|
+
) as observer:
|
|
89
|
+
result = await original_func(*args, **kwargs)
|
|
90
|
+
observer.result = result
|
|
91
|
+
|
|
92
|
+
return result
|
|
93
|
+
|
|
94
|
+
return async_wrapper
|
|
95
|
+
else:
|
|
96
|
+
|
|
97
|
+
@functools.wraps(original_func)
|
|
98
|
+
def sync_wrapper(*args, **kwargs):
|
|
99
|
+
with Observer(
|
|
100
|
+
span_type="tool",
|
|
101
|
+
func_name=original_func.__name__,
|
|
102
|
+
metrics=metrics,
|
|
103
|
+
metric_collection=metric_collection,
|
|
104
|
+
function_kwargs={"args": args, **kwargs},
|
|
105
|
+
) as observer:
|
|
106
|
+
result = original_func(*args, **kwargs)
|
|
107
|
+
observer.result = result
|
|
108
|
+
|
|
109
|
+
return result
|
|
110
|
+
|
|
111
|
+
return sync_wrapper
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def _patch_agent_init():
|
|
115
|
+
original_init = Agent.__init__
|
|
116
|
+
|
|
117
|
+
@functools.wraps(original_init)
|
|
118
|
+
def wrapper(
|
|
119
|
+
self,
|
|
120
|
+
*args,
|
|
121
|
+
llm_metric_collection: Optional[str] = None,
|
|
122
|
+
llm_metrics: Optional[List[BaseMetric]] = None,
|
|
123
|
+
llm_prompt: Optional[Prompt] = None,
|
|
124
|
+
agent_metric_collection: Optional[str] = None,
|
|
125
|
+
agent_metrics: Optional[List[BaseMetric]] = None,
|
|
126
|
+
**kwargs
|
|
127
|
+
):
|
|
128
|
+
result = original_init(self, *args, **kwargs)
|
|
129
|
+
_patch_llm_model(
|
|
130
|
+
self._model, llm_metric_collection, llm_metrics, llm_prompt
|
|
131
|
+
) # runtime patch of the model
|
|
132
|
+
_patch_agent_run(agent_metric_collection, agent_metrics)
|
|
133
|
+
return result
|
|
134
|
+
|
|
135
|
+
Agent.__init__ = wrapper
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def _patch_agent_run(
|
|
139
|
+
agent_metric_collection: Optional[str] = None,
|
|
140
|
+
agent_metrics: Optional[List[BaseMetric]] = None,
|
|
141
|
+
):
|
|
142
|
+
original_run = Agent.run
|
|
143
|
+
|
|
144
|
+
@functools.wraps(original_run)
|
|
145
|
+
async def wrapper(
|
|
146
|
+
*args,
|
|
147
|
+
trace_metric_collection: Optional[str] = None,
|
|
148
|
+
trace_metrics: Optional[List[BaseMetric]] = None,
|
|
149
|
+
trace_name: Optional[str] = None,
|
|
150
|
+
trace_tags: Optional[List[str]] = None,
|
|
151
|
+
trace_metadata: Optional[dict] = None,
|
|
152
|
+
trace_thread_id: Optional[str] = None,
|
|
153
|
+
trace_user_id: Optional[str] = None,
|
|
154
|
+
**kwargs
|
|
155
|
+
):
|
|
156
|
+
with Observer(
|
|
157
|
+
span_type="agent",
|
|
158
|
+
func_name="Agent",
|
|
159
|
+
function_kwargs={"input": args[1]},
|
|
160
|
+
metrics=agent_metrics,
|
|
161
|
+
metric_collection=agent_metric_collection,
|
|
162
|
+
) as observer:
|
|
163
|
+
result = await original_run(*args, **kwargs)
|
|
164
|
+
observer.update_span_properties = (
|
|
165
|
+
lambda agent_span: set_agent_span_attributes(agent_span, result)
|
|
166
|
+
)
|
|
167
|
+
observer.result = result.output
|
|
168
|
+
|
|
169
|
+
_update_trace_context(
|
|
170
|
+
trace_name=trace_name,
|
|
171
|
+
trace_tags=trace_tags,
|
|
172
|
+
trace_metadata=trace_metadata,
|
|
173
|
+
trace_thread_id=trace_thread_id,
|
|
174
|
+
trace_user_id=trace_user_id,
|
|
175
|
+
trace_metric_collection=trace_metric_collection,
|
|
176
|
+
trace_metrics=trace_metrics,
|
|
177
|
+
trace_input=args[1],
|
|
178
|
+
trace_output=result.output,
|
|
179
|
+
)
|
|
180
|
+
|
|
181
|
+
return result
|
|
182
|
+
|
|
183
|
+
Agent.run = wrapper
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def _update_trace_context(
|
|
187
|
+
trace_name: Optional[str] = None,
|
|
188
|
+
trace_tags: Optional[List[str]] = None,
|
|
189
|
+
trace_metadata: Optional[dict] = None,
|
|
190
|
+
trace_thread_id: Optional[str] = None,
|
|
191
|
+
trace_user_id: Optional[str] = None,
|
|
192
|
+
trace_metric_collection: Optional[str] = None,
|
|
193
|
+
trace_metrics: Optional[List[BaseMetric]] = None,
|
|
194
|
+
trace_input: Optional[Any] = None,
|
|
195
|
+
trace_output: Optional[Any] = None,
|
|
196
|
+
):
|
|
197
|
+
|
|
198
|
+
current_trace = current_trace_context.get()
|
|
199
|
+
current_trace.name = trace_name
|
|
200
|
+
current_trace.tags = trace_tags
|
|
201
|
+
current_trace.metadata = trace_metadata
|
|
202
|
+
current_trace.thread_id = trace_thread_id
|
|
203
|
+
current_trace.user_id = trace_user_id
|
|
204
|
+
current_trace.metric_collection = trace_metric_collection
|
|
205
|
+
current_trace.metrics = trace_metrics
|
|
206
|
+
current_trace.input = trace_input
|
|
207
|
+
current_trace.output = trace_output
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
def _patch_llm_model(
|
|
211
|
+
model: Model,
|
|
212
|
+
llm_metric_collection: Optional[str] = None,
|
|
213
|
+
llm_metrics: Optional[List[BaseMetric]] = None,
|
|
214
|
+
llm_prompt: Optional[Prompt] = None,
|
|
215
|
+
):
|
|
216
|
+
original_func = model.request
|
|
217
|
+
try:
|
|
218
|
+
model_name = model.model_name
|
|
219
|
+
except Exception:
|
|
220
|
+
model_name = "unknown"
|
|
221
|
+
|
|
222
|
+
@functools.wraps(original_func)
|
|
223
|
+
async def wrapper(*args, **kwargs):
|
|
224
|
+
with Observer(
|
|
225
|
+
span_type="llm",
|
|
226
|
+
func_name="LLM",
|
|
227
|
+
observe_kwargs={"model": model_name},
|
|
228
|
+
metrics=llm_metrics,
|
|
229
|
+
metric_collection=llm_metric_collection,
|
|
230
|
+
) as observer:
|
|
231
|
+
result = await original_func(*args, **kwargs)
|
|
232
|
+
request = kwargs.get("messages", [])
|
|
233
|
+
if not request:
|
|
234
|
+
request = args[0]
|
|
235
|
+
observer.update_span_properties = (
|
|
236
|
+
lambda llm_span: set_llm_span_attributes(
|
|
237
|
+
llm_span, args[0], result, llm_prompt
|
|
238
|
+
)
|
|
239
|
+
)
|
|
240
|
+
observer.result = result
|
|
241
|
+
return result
|
|
242
|
+
|
|
243
|
+
model.request = wrapper
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
def instrument(otel: Optional[bool] = False, api_key: Optional[str] = None):
|
|
247
|
+
|
|
248
|
+
if api_key:
|
|
249
|
+
deepeval.login(api_key)
|
|
250
|
+
|
|
251
|
+
api_key = get_confident_api_key()
|
|
252
|
+
|
|
253
|
+
if not api_key:
|
|
254
|
+
raise ValueError("No api key provided.")
|
|
255
|
+
|
|
256
|
+
if otel:
|
|
257
|
+
instrument_pydantic_ai(api_key)
|
|
258
|
+
else:
|
|
259
|
+
with capture_tracing_integration("pydantic_ai"):
|
|
260
|
+
_patch_agent_init()
|
|
261
|
+
_patch_agent_tool_decorator()
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
def set_llm_span_attributes(
|
|
265
|
+
llm_span: LlmSpan,
|
|
266
|
+
requests: List[ModelRequest],
|
|
267
|
+
result: ModelResponse,
|
|
268
|
+
llm_prompt: Optional[Prompt] = None,
|
|
269
|
+
):
|
|
270
|
+
llm_span.prompt = llm_prompt
|
|
271
|
+
|
|
272
|
+
input = []
|
|
273
|
+
for request in requests:
|
|
274
|
+
for part in request.parts:
|
|
275
|
+
if isinstance(part, SystemPromptPart):
|
|
276
|
+
input.append({"role": "System", "content": part.content})
|
|
277
|
+
elif isinstance(part, UserPromptPart):
|
|
278
|
+
input.append({"role": "User", "content": part.content})
|
|
279
|
+
elif isinstance(part, ToolCallPart):
|
|
280
|
+
input.append(
|
|
281
|
+
{
|
|
282
|
+
"role": "Tool Call",
|
|
283
|
+
"name": part.tool_name,
|
|
284
|
+
"content": part.args_as_json_str(),
|
|
285
|
+
}
|
|
286
|
+
)
|
|
287
|
+
elif isinstance(part, ToolReturnPart):
|
|
288
|
+
input.append(
|
|
289
|
+
{
|
|
290
|
+
"role": "Tool Return",
|
|
291
|
+
"name": part.tool_name,
|
|
292
|
+
"content": part.model_response_str(),
|
|
293
|
+
}
|
|
294
|
+
)
|
|
295
|
+
llm_span.input = input
|
|
296
|
+
|
|
297
|
+
content = ""
|
|
298
|
+
tool_calls = []
|
|
299
|
+
for part in result.parts:
|
|
300
|
+
if isinstance(part, TextPart):
|
|
301
|
+
content += part.content + "\n"
|
|
302
|
+
elif isinstance(part, ToolCallPart):
|
|
303
|
+
tool_calls.append(
|
|
304
|
+
LlmToolCall(name=part.tool_name, args=part.args_as_dict())
|
|
305
|
+
)
|
|
306
|
+
llm_span.output = LlmOutput(
|
|
307
|
+
role="Assistant", content=content, tool_calls=tool_calls
|
|
308
|
+
)
|
|
309
|
+
llm_span.tools_called = _extract_tools_called_from_llm_response(
|
|
310
|
+
result.parts
|
|
311
|
+
)
|
|
312
|
+
|
|
313
|
+
|
|
314
|
+
def set_agent_span_attributes(agent_span: AgentSpan, result: AgentRunResult):
|
|
315
|
+
agent_span.tools_called = _extract_tools_called(result)
|
|
316
|
+
|
|
317
|
+
|
|
318
|
+
# llm tools called
|
|
319
|
+
def _extract_tools_called_from_llm_response(
|
|
320
|
+
result: List[ModelResponsePart],
|
|
321
|
+
) -> List[ToolCall]:
|
|
322
|
+
tool_calls = []
|
|
323
|
+
|
|
324
|
+
# Loop through each ModelResponsePart
|
|
325
|
+
for part in result:
|
|
326
|
+
# Look for parts with part_kind="tool-call"
|
|
327
|
+
if hasattr(part, "part_kind") and part.part_kind == "tool-call":
|
|
328
|
+
# Extract tool name and args from the ToolCallPart
|
|
329
|
+
tool_name = part.tool_name
|
|
330
|
+
input_parameters = (
|
|
331
|
+
part.args_as_dict() if hasattr(part, "args_as_dict") else None
|
|
332
|
+
)
|
|
333
|
+
|
|
334
|
+
# Create and append ToolCall object
|
|
335
|
+
tool_call = ToolCall(
|
|
336
|
+
name=tool_name, input_parameters=input_parameters
|
|
337
|
+
)
|
|
338
|
+
tool_calls.append(tool_call)
|
|
339
|
+
|
|
340
|
+
return tool_calls
|
|
341
|
+
|
|
342
|
+
|
|
343
|
+
# TODO: llm tools called (reposne is present next message)
|
|
344
|
+
def _extract_tools_called(result: AgentRunResult) -> List[ToolCall]:
|
|
345
|
+
tool_calls = []
|
|
346
|
+
|
|
347
|
+
# Access the message history from the _state
|
|
348
|
+
message_history = result._state.message_history
|
|
349
|
+
|
|
350
|
+
# Scan through all messages in the history
|
|
351
|
+
for message in message_history:
|
|
352
|
+
# Check if this is a ModelResponse (kind="response")
|
|
353
|
+
if hasattr(message, "kind") and message.kind == "response":
|
|
354
|
+
# For ModelResponse messages, check each part
|
|
355
|
+
if hasattr(message, "parts"):
|
|
356
|
+
for part in message.parts:
|
|
357
|
+
# Look for parts with part_kind="tool-call"
|
|
358
|
+
if (
|
|
359
|
+
hasattr(part, "part_kind")
|
|
360
|
+
and part.part_kind == "tool-call"
|
|
361
|
+
):
|
|
362
|
+
# Extract tool name and args from the ToolCallPart
|
|
363
|
+
tool_name = part.tool_name
|
|
364
|
+
input_parameters = (
|
|
365
|
+
part.args_as_dict()
|
|
366
|
+
if hasattr(part, "args_as_dict")
|
|
367
|
+
else None
|
|
368
|
+
)
|
|
369
|
+
|
|
370
|
+
# Create and append ToolCall object
|
|
371
|
+
tool_call = ToolCall(
|
|
372
|
+
name=tool_name, input_parameters=input_parameters
|
|
373
|
+
)
|
|
374
|
+
tool_calls.append(tool_call)
|
|
375
|
+
|
|
376
|
+
return tool_calls
|
|
@@ -15,12 +15,12 @@ Usage:
|
|
|
15
15
|
|
|
16
16
|
from __future__ import annotations
|
|
17
17
|
import time
|
|
18
|
-
from typing import Final
|
|
18
|
+
from typing import Final, Union
|
|
19
19
|
|
|
20
20
|
# Module globals are initialised exactly once.
|
|
21
|
-
_anchor_perf_ns: int
|
|
22
|
-
_anchor_wall_ns: int
|
|
23
|
-
_offset_ns: int
|
|
21
|
+
_anchor_perf_ns: Union[int, None] = None
|
|
22
|
+
_anchor_wall_ns: Union[int, None] = None
|
|
23
|
+
_offset_ns: Union[int, None] = None
|
|
24
24
|
|
|
25
25
|
|
|
26
26
|
def init_clock_bridge() -> None:
|
deepeval/utils.py
CHANGED
|
@@ -516,7 +516,7 @@ def remove_pbars(
|
|
|
516
516
|
|
|
517
517
|
|
|
518
518
|
def read_env_int(
|
|
519
|
-
name: str, default: int, *, min_value: int
|
|
519
|
+
name: str, default: int, *, min_value: Union[int, None] = None
|
|
520
520
|
) -> int:
|
|
521
521
|
"""Read an integer from an environment variable with safe fallback.
|
|
522
522
|
|
|
@@ -545,7 +545,7 @@ def read_env_int(
|
|
|
545
545
|
|
|
546
546
|
|
|
547
547
|
def read_env_float(
|
|
548
|
-
name: str, default: float, *, min_value: float
|
|
548
|
+
name: str, default: float, *, min_value: Union[float, None] = None
|
|
549
549
|
) -> float:
|
|
550
550
|
"""Read a float from an environment variable with safe fallback.
|
|
551
551
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: deepeval
|
|
3
|
-
Version: 3.4.9
|
|
3
|
+
Version: 3.5.0
|
|
4
4
|
Summary: The LLM Evaluation Framework
|
|
5
5
|
Home-page: https://github.com/confident-ai/deepeval
|
|
6
6
|
License: Apache-2.0
|
|
@@ -189,16 +189,6 @@ Let's pretend your LLM application is a RAG based customer support chatbot; here
|
|
|
189
189
|
```
|
|
190
190
|
pip install -U deepeval
|
|
191
191
|
```
|
|
192
|
-
### Environment variables (.env / .env.local)
|
|
193
|
-
|
|
194
|
-
DeepEval auto-loads `.env.local` then `.env` from the current working directory **at import time**.
|
|
195
|
-
**Precedence:** process env -> `.env.local` -> `.env`.
|
|
196
|
-
Opt out with `DEEPEVAL_DISABLE_DOTENV=1`.
|
|
197
|
-
|
|
198
|
-
```bash
|
|
199
|
-
cp .env.example .env.local
|
|
200
|
-
# then edit .env.local (ignored by git)
|
|
201
|
-
```
|
|
202
192
|
|
|
203
193
|
## Create an account (highly recommended)
|
|
204
194
|
|
|
@@ -391,9 +381,20 @@ evaluate(dataset, [answer_relevancy_metric])
|
|
|
391
381
|
dataset.evaluate([answer_relevancy_metric])
|
|
392
382
|
```
|
|
393
383
|
|
|
394
|
-
|
|
384
|
+
## A Note on Env Variables (.env / .env.local)
|
|
385
|
+
|
|
386
|
+
DeepEval auto-loads `.env.local` then `.env` from the current working directory **at import time**.
|
|
387
|
+
**Precedence:** process env -> `.env.local` -> `.env`.
|
|
388
|
+
Opt out with `DEEPEVAL_DISABLE_DOTENV=1`.
|
|
389
|
+
|
|
390
|
+
```bash
|
|
391
|
+
cp .env.example .env.local
|
|
392
|
+
# then edit .env.local (ignored by git)
|
|
393
|
+
```
|
|
394
|
+
|
|
395
|
+
# DeepEval With Confident AI
|
|
395
396
|
|
|
396
|
-
|
|
397
|
+
DeepEval's cloud platform, [Confident AI](https://confident-ai.com?utm_source=Github), allows you to:
|
|
397
398
|
|
|
398
399
|
1. Curate/annotate evaluation datasets on the cloud
|
|
399
400
|
2. Benchmark LLM app using dataset, and compare with previous iterations to experiment which models/prompts works best
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
deepeval/__init__.py,sha256=6fsb813LD_jNhqR-xZnSdE5E-KsBbC3tc4oIg5ZMgTw,2115
|
|
2
|
-
deepeval/_version.py,sha256=
|
|
2
|
+
deepeval/_version.py,sha256=xgoMNdDXsY3c4GfV1_DVK-xGdMOp5KCDaKln5j0PJdY,27
|
|
3
3
|
deepeval/annotation/__init__.py,sha256=ZFhUVNNuH_YgQSZJ-m5E9iUb9TkAkEV33a6ouMDZ8EI,111
|
|
4
4
|
deepeval/annotation/annotation.py,sha256=3j3-syeJepAcEj3u3e4T_BeRDzNr7yXGDIoNQGMKpwQ,2298
|
|
5
5
|
deepeval/annotation/api.py,sha256=EYN33ACVzVxsFleRYm60KB4Exvff3rPJKt1VBuuX970,2147
|
|
@@ -77,7 +77,7 @@ deepeval/benchmarks/bool_q/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJ
|
|
|
77
77
|
deepeval/benchmarks/bool_q/bool_q.py,sha256=wJM4-wSybT8EwgDJVB4p3QYXGNzLD3tdrpGE1cNEz_E,5507
|
|
78
78
|
deepeval/benchmarks/bool_q/template.py,sha256=pgNj4RR6-4VJDDySwnKt-MpghBCjVlZ7fPKY6PltllQ,4055
|
|
79
79
|
deepeval/benchmarks/drop/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
80
|
-
deepeval/benchmarks/drop/drop.py,sha256=
|
|
80
|
+
deepeval/benchmarks/drop/drop.py,sha256=rGcqd79-IfQ2tvPuAL6wrON4R0hBiVGBy1OtDRmertE,12042
|
|
81
81
|
deepeval/benchmarks/drop/task.py,sha256=RV7DEXF192IOsY-yIVdlGb_y-A_sS5APPn8PGOPn5yU,17950
|
|
82
82
|
deepeval/benchmarks/drop/template.py,sha256=1P0mx_71Bxr9juIA8nGpVRIrP8NSoDILkIicjWvqE94,1376
|
|
83
83
|
deepeval/benchmarks/equity_med_qa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -88,7 +88,7 @@ deepeval/benchmarks/gsm8k/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJW
|
|
|
88
88
|
deepeval/benchmarks/gsm8k/gsm8k.py,sha256=LyJQBskKuqp013LLO3539RJiRXqCmlGl12BPXvQ8P88,6974
|
|
89
89
|
deepeval/benchmarks/gsm8k/template.py,sha256=3F7DwQwhJwKxtlbaO6TNvBBRaDEUBEp58JwirSjxtR0,1626
|
|
90
90
|
deepeval/benchmarks/hellaswag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
91
|
-
deepeval/benchmarks/hellaswag/hellaswag.py,sha256=
|
|
91
|
+
deepeval/benchmarks/hellaswag/hellaswag.py,sha256=_3felzBwQUhhRXk4D9NbcY8dme_qUQcwUjKGw9OtDJg,11972
|
|
92
92
|
deepeval/benchmarks/hellaswag/task.py,sha256=LfO8T6bpNiwdM8VdubKrup7qje3-rHgu69iB6Sdsc6I,7323
|
|
93
93
|
deepeval/benchmarks/hellaswag/template.py,sha256=TcCu25hkl89qbRwcEyRVGTGp7DU_5Eph754W2znk5QY,1279
|
|
94
94
|
deepeval/benchmarks/human_eval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -102,15 +102,15 @@ deepeval/benchmarks/lambada/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMp
|
|
|
102
102
|
deepeval/benchmarks/lambada/lambada.py,sha256=FExZLpDBgQfYe9o-MBS0LEy0-i4jHGeFHo8XCbMW_io,5556
|
|
103
103
|
deepeval/benchmarks/lambada/template.py,sha256=mSn0Elvp34wTnvaAm3IENz0mfGSNM_iRx50hIouk4t0,3776
|
|
104
104
|
deepeval/benchmarks/logi_qa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
105
|
-
deepeval/benchmarks/logi_qa/logi_qa.py,sha256=
|
|
105
|
+
deepeval/benchmarks/logi_qa/logi_qa.py,sha256=VNZGASigEDlJjzwGZtWG3OUs3v3P733GD84-h3TaxjU,11143
|
|
106
106
|
deepeval/benchmarks/logi_qa/task.py,sha256=pVMLVHPyDFSyoIsnckBNRDt8FK0J317PiGT-0dpr7rs,350
|
|
107
107
|
deepeval/benchmarks/logi_qa/template.py,sha256=EddGd2s3u2bPejogTcM50SDS7ynHnMhHaKuqQjjZoLk,4354
|
|
108
108
|
deepeval/benchmarks/math_qa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
109
|
-
deepeval/benchmarks/math_qa/math_qa.py,sha256=
|
|
109
|
+
deepeval/benchmarks/math_qa/math_qa.py,sha256=_eP-yocJom9r91qmAUBbIH4hrWazEHLV2lDKu0yMfEI,10651
|
|
110
110
|
deepeval/benchmarks/math_qa/task.py,sha256=3q_jlK5kIl5Zs0mQwuzxyvmPP6ncLZwszn7gtl1GfZs,192
|
|
111
111
|
deepeval/benchmarks/math_qa/template.py,sha256=pC3PB2GGU5TQ81I7E76RJh0xlu7xiF6d4SK3T_Nksh8,4468
|
|
112
112
|
deepeval/benchmarks/mmlu/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
113
|
-
deepeval/benchmarks/mmlu/mmlu.py,sha256=
|
|
113
|
+
deepeval/benchmarks/mmlu/mmlu.py,sha256=flg3tb052DVo7wnfAHkW9n07tEEhHrkT2C0d5-UMBoQ,11431
|
|
114
114
|
deepeval/benchmarks/mmlu/task.py,sha256=HnhnuD4Xjur9GlrBtswaR7ZPouGx4NTgbcFZu_oIzXw,2580
|
|
115
115
|
deepeval/benchmarks/mmlu/template.py,sha256=MsdcrZWVkyZpEw--Kj6W7vjOJgig-ABiz9B3WtZz1MQ,1303
|
|
116
116
|
deepeval/benchmarks/modes/__init__.py,sha256=IGhZp0-nmvVsZWBnTuBvKhdGiy4TJZShFSjYAeBZdbo,135
|
|
@@ -125,7 +125,7 @@ deepeval/benchmarks/truthful_qa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm
|
|
|
125
125
|
deepeval/benchmarks/truthful_qa/mode.py,sha256=66aXCTXGTbAprA33M3TT15OhpJAqxLPDzJuShKxiFwY,84
|
|
126
126
|
deepeval/benchmarks/truthful_qa/task.py,sha256=PmfPbqINd9wizq8Tpk8pwms9TersoGlMGBqxpTmZhcc,1360
|
|
127
127
|
deepeval/benchmarks/truthful_qa/template.py,sha256=5y6mfJm9AXnQL7xwrfsZjH080GwO1kd_1GdTzDCoYgo,4465
|
|
128
|
-
deepeval/benchmarks/truthful_qa/truthful_qa.py,sha256=
|
|
128
|
+
deepeval/benchmarks/truthful_qa/truthful_qa.py,sha256=2r-xcFnzSSJds3ZGxYogBzjFFrCfJxYpXzKrpE8cC_c,13781
|
|
129
129
|
deepeval/benchmarks/utils.py,sha256=NHImqH22mv108_CKM7ajTpu4hOeUhr5xPicbf0i2qGk,287
|
|
130
130
|
deepeval/benchmarks/winogrande/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
131
131
|
deepeval/benchmarks/winogrande/template.py,sha256=tDwH8NpNF9x7FbDmQw45XaW1LNqGBV6zP5pwV1uOlwM,2089
|
|
@@ -138,7 +138,7 @@ deepeval/cli/test.py,sha256=kSIFMRTAfVzBJ4OitwvT829-ylV7UzPMP57P2DePS-Q,5482
|
|
|
138
138
|
deepeval/cli/types.py,sha256=_7KdthstHNc-JKCWrfpDQCf_j8h9PMxh0qJCHmVXJr0,310
|
|
139
139
|
deepeval/cli/utils.py,sha256=F4-yuONzk4ojDoSLjI9RYERB7HOD412iZ2lNlSCq4wk,5601
|
|
140
140
|
deepeval/confident/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
|
|
141
|
-
deepeval/confident/api.py,sha256
|
|
141
|
+
deepeval/confident/api.py,sha256=-2i3IBLtj5bUIImwOF6ltGVR3ZyViIbIC38XxwWvf54,8318
|
|
142
142
|
deepeval/confident/types.py,sha256=-slFhDof_1maMgpLxqDRZv6kz6ZVY2hP_0uj_aveJKU,533
|
|
143
143
|
deepeval/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
144
144
|
deepeval/config/settings.py,sha256=e7sk6_3I14hG457e75DoJd9Ojo3rOkpBZzsMYlj4gKQ,18139
|
|
@@ -179,9 +179,9 @@ deepeval/integrations/llama_index/__init__.py,sha256=zBwUFQXDp6QFtp1cfANy8ucV08r
|
|
|
179
179
|
deepeval/integrations/llama_index/agent/patched.py,sha256=4JbH0WQmt4lct7xxIH0phj8_Y-V35dgVv7DEDXK0jZI,2149
|
|
180
180
|
deepeval/integrations/llama_index/handler.py,sha256=eqI1n8E4MsvfKoFs5Zrm9IdCR7g9eBgNedISs7UkU_I,8947
|
|
181
181
|
deepeval/integrations/llama_index/utils.py,sha256=mxW71-3PjvBvJpLIU0kNWuTzCidy5l_-roLt8ZyWYA0,2599
|
|
182
|
-
deepeval/integrations/pydantic_ai/__init__.py,sha256=
|
|
183
|
-
deepeval/integrations/pydantic_ai/
|
|
184
|
-
deepeval/integrations/pydantic_ai/
|
|
182
|
+
deepeval/integrations/pydantic_ai/__init__.py,sha256=36fBKBLRo1y5jFlj0Y4xhDJsiq4ZnqtmFO32R90Azo4,96
|
|
183
|
+
deepeval/integrations/pydantic_ai/otel.py,sha256=2DpO3RapdztXPlT9BWhQfF4dJDMyp2X7YvuplJ0SwC8,1661
|
|
184
|
+
deepeval/integrations/pydantic_ai/patcher.py,sha256=wszU2YROZAQovyz1ZNRvTtsuJ5By_x4SF6yjtmItcNk,12210
|
|
185
185
|
deepeval/key_handler.py,sha256=damdQEBLGy4IVk5DR5-E3blIZdLbcMtyeGAFn_4_SG4,6505
|
|
186
186
|
deepeval/metrics/__init__.py,sha256=xofaK_bJq0QCSerSWYjHYRXXch9YQwZHxIfVAv1G7fo,4012
|
|
187
187
|
deepeval/metrics/answer_relevancy/__init__.py,sha256=WbZUpoSg2GQoqJ4VIRirVVQ1JDx5xwT-RskwqNKfWGM,46
|
|
@@ -373,8 +373,8 @@ deepeval/models/llms/anthropic_model.py,sha256=T55-jKRbM3_B3Db9M3ruklm2cVVU1JDGA
|
|
|
373
373
|
deepeval/models/llms/azure_model.py,sha256=MG6sVGUgIy2RURwFWvRP7O_RF6QAg2dpqXIJhIsgY60,10994
|
|
374
374
|
deepeval/models/llms/deepseek_model.py,sha256=mz0U0uqazAVr8vv8SF74GRTr4ZEVc3Q1v9o5TVbmz_8,5440
|
|
375
375
|
deepeval/models/llms/gemini_model.py,sha256=QXf9mjopfWwJxpm0gbkXo6op_Wtu1GaIt1BfzS3OU8Q,8174
|
|
376
|
-
deepeval/models/llms/grok_model.py,sha256=
|
|
377
|
-
deepeval/models/llms/kimi_model.py,sha256=
|
|
376
|
+
deepeval/models/llms/grok_model.py,sha256=zPBmPnNCRGrtg_709gFv5A4iz7WilTmDpAyOpjXTa_M,5986
|
|
377
|
+
deepeval/models/llms/kimi_model.py,sha256=ZcvEwWgnv1dtmbq7LgMQJAjpkjxZr-l5eBi9KGqRbb0,6726
|
|
378
378
|
deepeval/models/llms/litellm_model.py,sha256=iu4-_JCpd9LdEa-eCWseD2iLTA-r7OSgYGWQ0IxB4eA,11527
|
|
379
379
|
deepeval/models/llms/local_model.py,sha256=PeF6ofMR8jBMTLzkCkgmkBJix9kHbWV5vTKGx8nehFs,3605
|
|
380
380
|
deepeval/models/llms/ollama_model.py,sha256=foL6sMza37Z0HH8qPStyIr1g-xEaD6Ce53L2C8Er-P8,3055
|
|
@@ -454,13 +454,13 @@ deepeval/tracing/otel/__init__.py,sha256=HQsaF5yLPwyW5qg8AOV81_nG_7pFHnatOTHi9Wx
|
|
|
454
454
|
deepeval/tracing/otel/exporter.py,sha256=dXQd834zm5rm1ss9pWkBBlk-JSdtiw7aFLso2hM53XY,26372
|
|
455
455
|
deepeval/tracing/otel/utils.py,sha256=g8yAzhqbPh1fOKCWkfNekC6AVotLfu1SUcfNMo6zii8,9786
|
|
456
456
|
deepeval/tracing/patchers.py,sha256=DAPNkhrDtoeyJIVeQDUMhTz-xGcXu00eqjQZmov8FiU,3096
|
|
457
|
-
deepeval/tracing/perf_epoch_bridge.py,sha256=
|
|
457
|
+
deepeval/tracing/perf_epoch_bridge.py,sha256=iyAPddB6Op7NpMtPHJ29lDm53Btz9yLaN6xSCfTRQm4,1825
|
|
458
458
|
deepeval/tracing/tracing.py,sha256=StvwFEG3MG67n7PBEyDDycdj0myMbP3LMB_FBhaZH-Y,38741
|
|
459
459
|
deepeval/tracing/types.py,sha256=3w5HEI6y4zuzVr8xGEEzDviLZCX_s_pK85qbwnyf1aY,5196
|
|
460
460
|
deepeval/tracing/utils.py,sha256=eTEickbDvRiOu1twNolh4sHnjZF49vqdLgI74BudeTw,6357
|
|
461
|
-
deepeval/utils.py,sha256=
|
|
462
|
-
deepeval-3.
|
|
463
|
-
deepeval-3.
|
|
464
|
-
deepeval-3.
|
|
465
|
-
deepeval-3.
|
|
466
|
-
deepeval-3.
|
|
461
|
+
deepeval/utils.py,sha256=EimWDwI1pKCE8vl6kuTnGbGT6ep9zHL5sZ0o-gj49XI,16857
|
|
462
|
+
deepeval-3.5.0.dist-info/LICENSE.md,sha256=0ATkuLv6QgsJTBODUHC5Rak_PArA6gv2t7inJzNTP38,11352
|
|
463
|
+
deepeval-3.5.0.dist-info/METADATA,sha256=KBAB5m11q4GAhVwCJBmXZDtaYtKoAO3sQ0vg-ajFRLg,18682
|
|
464
|
+
deepeval-3.5.0.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
|
|
465
|
+
deepeval-3.5.0.dist-info/entry_points.txt,sha256=fVr8UphXTfJe9I2rObmUtfU3gkSrYeM0pLy-NbJYg10,94
|
|
466
|
+
deepeval-3.5.0.dist-info/RECORD,,
|
|
@@ -1,364 +0,0 @@
|
|
|
1
|
-
from deepeval.telemetry import capture_tracing_integration
|
|
2
|
-
from deepeval.metrics import BaseMetric
|
|
3
|
-
from typing import List, Optional
|
|
4
|
-
import functools
|
|
5
|
-
import inspect
|
|
6
|
-
import json
|
|
7
|
-
from deepeval.test_case import LLMTestCase
|
|
8
|
-
from deepeval.tracing.types import TestCaseMetricPair
|
|
9
|
-
from deepeval.tracing.tracing import trace_manager
|
|
10
|
-
from deepeval.tracing.otel.utils import parse_string, parse_list_of_strings
|
|
11
|
-
from opentelemetry import trace
|
|
12
|
-
|
|
13
|
-
# OpenTelemetry is treated as an optional dependency here: remember whether the
# import worked so is_opentelemetry_available() can raise a helpful error later.
try:
    from opentelemetry.trace import NoOpTracer

    opentelemetry_installed = True
except ImportError:
    # Only a failed import means "not installed". A bare ``except:`` would also
    # swallow KeyboardInterrupt/SystemExit raised while importing.
    opentelemetry_installed = False
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
def is_opentelemetry_available():
    """Return ``True`` when the OpenTelemetry import at module load succeeded.

    Raises:
        ImportError: If the module-level ``opentelemetry_installed`` flag is
            false, with installation instructions in the message.
    """
    if opentelemetry_installed:
        return True
    raise ImportError(
        "OpenTelemetry SDK is not available. Please install it with `pip install opentelemetry-sdk`."
    )
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
# Pydantic AI is an optional dependency: remember whether the import worked so
# is_pydantic_ai_installed() can raise a helpful error later.
try:
    from pydantic_ai.agent import Agent
    from pydantic_ai.models.instrumented import InstrumentedModel

    pydantic_ai_installed = True
except ImportError:
    # Only a failed import means "not installed". A bare ``except:`` would also
    # swallow KeyboardInterrupt/SystemExit raised while importing.
    pydantic_ai_installed = False
    # Placeholders keep this module importable: ``class PydanticAIAgent(Agent)``
    # is evaluated at import time and would otherwise die with a NameError
    # before PydanticAIAgent.__init__ gets the chance to raise the descriptive
    # ImportError from is_pydantic_ai_installed().
    Agent = object
    InstrumentedModel = None
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
def is_pydantic_ai_installed():
    """Return ``True`` when the Pydantic AI import at module load succeeded.

    Mirrors ``is_opentelemetry_available``: the success path returns ``True``
    (instead of the implicit ``None``) so the result can be used in a boolean
    context.

    Raises:
        ImportError: If the module-level ``pydantic_ai_installed`` flag is
            false, with installation instructions in the message.
    """
    if not pydantic_ai_installed:
        raise ImportError(
            "Pydantic AI is not installed. Please install it with `pip install pydantic-ai`."
        )
    return True
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
class PydanticAIAgent(Agent):
    """``Agent`` subclass that writes ``confident.*`` attributes onto OpenTelemetry spans.

    On construction, the instance's ``run``, ``run_sync`` and ``tool``
    attributes are replaced with wrappers. The ``run``/``run_sync`` wrappers
    accept these extra keyword arguments, which are consumed here and not
    forwarded to the underlying ``Agent`` method:

    * ``metric_collection`` / ``metrics``
    * ``trace_name`` / ``trace_tags`` / ``trace_metadata`` /
      ``trace_thread_id`` / ``trace_user_id``

    The ``tool`` wrapper accepts ``metric_collection`` and ``metrics``.
    """

    def __init__(self, *args, **kwargs):
        """Build the underlying agent, then install the per-instance wrappers.

        Raises:
            ImportError: If Pydantic AI or OpenTelemetry could not be imported.
        """
        with capture_tracing_integration("pydantic_ai.agent.PydanticAIAgent"):
            is_pydantic_ai_installed()
            is_opentelemetry_available()

            super().__init__(*args, **kwargs)

            # Values the patched run_sync parks on the instance so the patched
            # run can pick them up.
            self.metric_collection: Optional[str] = None
            self.metrics: Optional[List[BaseMetric]] = None

            # Trace-level values handed over the same way.
            self._trace_name: Optional[str] = None
            self._trace_tags: Optional[List[str]] = None
            self._trace_metadata: Optional[dict] = None
            self._trace_thread_id: Optional[str] = None
            self._trace_user_id: Optional[str] = None

            # Patch only this instance; the patch order is kept as-is.
            self._patch_run_method()
            self._patch_run_method_sync()
            self._patch_tool_decorator()

    @staticmethod
    def _confident_validate_run_arguments(metrics, trace_metadata):
        """Type-check the ``metrics`` and ``trace_metadata`` run arguments.

        Raises:
            TypeError: If ``metrics`` is not ``None`` and not a list of
                ``BaseMetric`` instances, or if ``trace_metadata`` is not
                ``None`` and not a ``dict``.
        """
        if metrics is not None and not (
            isinstance(metrics, list)
            and all(isinstance(m, BaseMetric) for m in metrics)
        ):
            raise TypeError("metrics must be a list of BaseMetric instances")

        if trace_metadata is not None and not isinstance(trace_metadata, dict):
            raise TypeError("trace_metadata must be a dictionary")

    @staticmethod
    def _confident_extract_prompt(args, kwargs):
        """Return the prompt given to ``run`` as a ``str`` or list of ``str``, else ``""``.

        The prompt is looked up as the first positional argument and, when no
        positional argument was given, as the ``user_prompt`` keyword, so a
        keyword-only or prompt-less call no longer raises ``IndexError``.
        """
        candidate = args[0] if args else kwargs.get("user_prompt")
        if isinstance(candidate, str):
            return candidate
        if isinstance(candidate, list) and all(
            isinstance(item, str) for item in candidate
        ):
            return candidate
        return ""

    @staticmethod
    def _confident_record_tool_output(result, metric_collection):
        """Write a tool's result and metric collection onto the current span."""
        current_span = trace.get_current_span()
        if not current_span.is_recording():
            return

        try:
            result_str = str(result)
        except Exception:
            # Best effort: a result that cannot be stringified must not break
            # the tool call itself.
            result_str = ""
        current_span.set_attribute("confident.span.output", result_str)

        if metric_collection:
            current_span.set_attribute(
                "confident.span.metric_collection",
                metric_collection,
            )
        # TODO: add metrics in component level evals

    @staticmethod
    def _confident_set_trace_attributes(
        span, name, tags, metadata, thread_id, user_id
    ):
        """Set each ``confident.trace.*`` attribute whose value is truthy."""
        if name:
            span.set_attribute("confident.trace.name", name)
        if tags:
            span.set_attribute("confident.trace.tags", tags)
        if metadata:
            # Metadata is stored as a JSON string.
            span.set_attribute("confident.trace.metadata", json.dumps(metadata))
        if thread_id:
            span.set_attribute("confident.trace.thread_id", thread_id)
        if user_id:
            span.set_attribute("confident.trace.user_id", user_id)

    def _patch_tool_decorator(self):
        """Wrap ``self.tool`` so decorated tools record their output on the current span."""
        original_tool = self.tool

        @functools.wraps(original_tool)
        def patched_tool(
            *args,
            metric_collection: Optional[str] = None,
            metrics: Optional[List[BaseMetric]] = None,
            **kwargs
        ):
            # Direct decoration (@agent.tool): the function is the first argument.
            if args and callable(args[0]):
                patched_func = self._create_patched_function(
                    args[0], metric_collection, metrics
                )
                new_args = (patched_func,) + args[1:]
                return original_tool(*new_args, **kwargs)

            # Parameterised decoration (@agent.tool(metric_collection="...")):
            # return a decorator that will receive the function.
            def decorator_with_params(func):
                patched_func = self._create_patched_function(
                    func, metric_collection, metrics
                )
                return original_tool(patched_func, **kwargs)

            return decorator_with_params

        # Replace the tool method for this instance only.
        self.tool = patched_tool

    def _create_patched_function(
        self, original_func, metric_collection, metrics
    ):
        """Return a wrapper around ``original_func`` that records its result on the span.

        Coroutine functions get an ``async`` wrapper; everything else gets a
        plain one. ``metrics`` is accepted but not used yet (see the TODO in
        ``_confident_record_tool_output``).
        """
        if inspect.iscoroutinefunction(original_func):

            @functools.wraps(original_func)
            async def patched_async_func(*func_args, **func_kwargs):
                result = await original_func(*func_args, **func_kwargs)
                self._confident_record_tool_output(result, metric_collection)
                return result

            return patched_async_func

        @functools.wraps(original_func)
        def patched_sync_func(*func_args, **func_kwargs):
            result = original_func(*func_args, **func_kwargs)
            self._confident_record_tool_output(result, metric_collection)
            return result

        return patched_sync_func

    def _patch_run_method(self):
        """Wrap ``self.run`` for this instance so each run is recorded on an "agent" span."""
        original_run = self.run

        @functools.wraps(original_run)
        async def patched_run(
            *args,
            metric_collection=None,
            metrics=None,
            trace_name=None,
            trace_tags=None,
            trace_metadata=None,
            trace_thread_id=None,
            trace_user_id=None,
            **kwargs
        ):
            # Normalise and validate the extra keyword arguments.
            metric_collection = parse_string(metric_collection)
            trace_name = parse_string(trace_name)
            trace_tags = parse_list_of_strings(trace_tags)
            trace_thread_id = parse_string(trace_thread_id)
            trace_user_id = parse_string(trace_user_id)
            self._confident_validate_run_arguments(metrics, trace_metadata)

            model = kwargs.get("model", None)
            infer_name = kwargs.get("infer_name", True)

            if infer_name and self.name is None:
                # Kept inline on purpose: the frame passed is this wrapper's
                # own frame, and a helper would pass a different one.
                self._infer_name(inspect.currentframe())
            model_used = self._get_model(model)
            del model

            # Use the model's own tracer when it is instrumented; otherwise a
            # no-op tracer.
            if isinstance(model_used, InstrumentedModel):
                tracer = model_used.instrumentation_settings.tracer
            else:
                tracer = NoOpTracer()

            with tracer.start_as_current_span("agent") as run_span:
                result = await original_run(*args, **kwargs)

                name = str(self.name) if self.name else "agent"
                prompt = self._confident_extract_prompt(args, kwargs)

                try:
                    output = str(result.output)
                except Exception:
                    # Best effort: a missing or unprintable output is recorded
                    # as an empty string.
                    output = ""

                # Agent span attributes.
                run_span.set_attribute("confident.span.type", "agent")
                run_span.set_attribute("confident.agent.name", name)
                run_span.set_attribute("confident.agent.input", prompt)
                run_span.set_attribute("confident.agent.output", output)

                # Fallback for input and output not being set.
                run_span.set_attribute("confident.span.input", prompt)
                run_span.set_attribute("confident.span.output", output)

                # The explicit argument wins; the instance value is the one
                # parked by run_sync.
                selected_collection = metric_collection or self.metric_collection
                if selected_collection:
                    run_span.set_attribute(
                        "confident.span.metric_collection", selected_collection
                    )

                # Explicit arguments first, then the values parked by
                # run_sync; a value set by the second call overwrites the first.
                self._confident_set_trace_attributes(
                    run_span,
                    trace_name,
                    trace_tags,
                    trace_metadata,
                    trace_thread_id,
                    trace_user_id,
                )
                self._confident_set_trace_attributes(
                    run_span,
                    self._trace_name,
                    self._trace_tags,
                    self._trace_metadata,
                    self._trace_thread_id,
                    self._trace_user_id,
                )

                selected_metrics = metrics or self.metrics
                if selected_metrics:
                    trace_manager.test_case_metrics.append(
                        TestCaseMetricPair(
                            test_case=LLMTestCase(
                                input=prompt, actual_output=output
                            ),
                            metrics=selected_metrics,
                        )
                    )

            return result

        # Replace the method only for this instance.
        self.run = patched_run

    def _patch_run_method_sync(self):
        """Wrap ``self.run_sync`` for this instance.

        The wrapper parks the extra keyword arguments on the instance for the
        duration of the call and restores the previous values afterwards, so
        one call's settings do not leak into later ``run``/``run_sync`` calls.
        """
        original_run = self.run_sync

        @functools.wraps(original_run)
        def patched_run(
            *args,
            metric_collection=None,
            metrics=None,
            trace_name=None,
            trace_tags=None,
            trace_metadata=None,
            trace_thread_id=None,
            trace_user_id=None,
            **kwargs
        ):
            metric_collection = parse_string(metric_collection)
            trace_name = parse_string(trace_name)
            trace_tags = parse_list_of_strings(trace_tags)
            trace_thread_id = parse_string(trace_thread_id)
            trace_user_id = parse_string(trace_user_id)
            self._confident_validate_run_arguments(metrics, trace_metadata)

            # Snapshot the current state so it can be restored after the call.
            previous_state = (
                self.metric_collection,
                self.metrics,
                self._trace_name,
                self._trace_tags,
                self._trace_metadata,
                self._trace_thread_id,
                self._trace_user_id,
            )

            if metric_collection:
                self.metric_collection = metric_collection
            if metrics:
                self.metrics = metrics

            self._trace_name = trace_name
            self._trace_tags = trace_tags
            self._trace_metadata = trace_metadata
            self._trace_thread_id = trace_thread_id
            self._trace_user_id = trace_user_id

            try:
                return original_run(*args, **kwargs)
            finally:
                (
                    self.metric_collection,
                    self.metrics,
                    self._trace_name,
                    self._trace_tags,
                    self._trace_metadata,
                    self._trace_thread_id,
                    self._trace_user_id,
                ) = previous_state

        # Replace the method only for this instance.
        self.run_sync = patched_run
|
|
File without changes
|
|
File without changes
|
|
File without changes
|