deepeval 3.4.9__py3-none-any.whl → 3.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/benchmarks/drop/drop.py +2 -3
- deepeval/benchmarks/hellaswag/hellaswag.py +2 -2
- deepeval/benchmarks/logi_qa/logi_qa.py +2 -2
- deepeval/benchmarks/math_qa/math_qa.py +2 -2
- deepeval/benchmarks/mmlu/mmlu.py +2 -2
- deepeval/benchmarks/truthful_qa/truthful_qa.py +2 -2
- deepeval/confident/api.py +3 -0
- deepeval/integrations/langchain/callback.py +21 -0
- deepeval/integrations/pydantic_ai/__init__.py +2 -4
- deepeval/integrations/pydantic_ai/{setup.py → otel.py} +0 -8
- deepeval/integrations/pydantic_ai/patcher.py +376 -0
- deepeval/metrics/__init__.py +1 -1
- deepeval/metrics/answer_relevancy/template.py +13 -38
- deepeval/metrics/faithfulness/template.py +17 -27
- deepeval/models/llms/grok_model.py +1 -1
- deepeval/models/llms/kimi_model.py +1 -1
- deepeval/prompt/api.py +22 -4
- deepeval/prompt/prompt.py +131 -17
- deepeval/synthesizer/synthesizer.py +17 -9
- deepeval/tracing/api.py +3 -0
- deepeval/tracing/context.py +3 -1
- deepeval/tracing/perf_epoch_bridge.py +4 -4
- deepeval/tracing/tracing.py +12 -2
- deepeval/tracing/types.py +3 -0
- deepeval/tracing/utils.py +6 -2
- deepeval/utils.py +2 -2
- {deepeval-3.4.9.dist-info → deepeval-3.5.1.dist-info}/METADATA +14 -13
- {deepeval-3.4.9.dist-info → deepeval-3.5.1.dist-info}/RECORD +32 -32
- deepeval/integrations/pydantic_ai/agent.py +0 -364
- {deepeval-3.4.9.dist-info → deepeval-3.5.1.dist-info}/LICENSE.md +0 -0
- {deepeval-3.4.9.dist-info → deepeval-3.5.1.dist-info}/WHEEL +0 -0
- {deepeval-3.4.9.dist-info → deepeval-3.5.1.dist-info}/entry_points.txt +0 -0
deepeval/_version.py
CHANGED
@@ -1 +1 @@
-__version__: str = "3.4.9"
+__version__: str = "3.5.1"
deepeval/benchmarks/drop/drop.py
CHANGED
@@ -1,6 +1,5 @@
-from typing import List, Optional, Dict
+from typing import List, Optional, Dict, Union
 from tqdm import tqdm
-from typing import Union
 
 from deepeval.dataset import Golden
 from deepeval.benchmarks.base_benchmark import (
@@ -50,7 +49,7 @@ class DROP(DeepEvalBaseBenchmark):
         self,
         model: DeepEvalBaseLLM,
         *args,
-        batch_size: int
+        batch_size: Union[int, None] = None,
         **kwargs,
     ) -> DeepEvalBaseBenchmarkResult:
         import pandas as pd
deepeval/benchmarks/hellaswag/hellaswag.py
CHANGED
@@ -1,4 +1,4 @@
-from typing import List, Dict, Optional
+from typing import List, Dict, Optional, Union
 from tqdm import tqdm
 
 from deepeval.dataset import Golden
@@ -51,7 +51,7 @@ class HellaSwag(DeepEvalBaseBenchmark):
         self,
         model: DeepEvalBaseLLM,
         *args,
-        batch_size: int
+        batch_size: Union[int, None] = None,
         **kwargs,
     ) -> DeepEvalBaseBenchmarkResult:
         import pandas as pd
deepeval/benchmarks/logi_qa/logi_qa.py
CHANGED
@@ -1,4 +1,4 @@
-from typing import List, Optional, Dict
+from typing import List, Optional, Dict, Union
 from tqdm import tqdm
 import requests
 import json
@@ -52,7 +52,7 @@ class LogiQA(DeepEvalBaseBenchmark):
         self,
         model: DeepEvalBaseLLM,
         *args,
-        batch_size: int
+        batch_size: Union[int, None] = None,
         **kwargs,
     ) -> DeepEvalBaseBenchmarkResult:
         import pandas as pd
deepeval/benchmarks/math_qa/math_qa.py
CHANGED
@@ -1,4 +1,4 @@
-from typing import List, Optional, Dict
+from typing import List, Optional, Dict, Union
 from tqdm import tqdm
 
 from deepeval.dataset import Golden
@@ -50,7 +50,7 @@ class MathQA(DeepEvalBaseBenchmark):
         self,
         model: DeepEvalBaseLLM,
         *args,
-        batch_size: int
+        batch_size: Union[int, None] = None,
         **kwargs,
     ) -> DeepEvalBaseBenchmarkResult:
         import pandas as pd
deepeval/benchmarks/mmlu/mmlu.py
CHANGED
@@ -1,4 +1,4 @@
-from typing import List, Optional, Dict
+from typing import List, Optional, Dict, Union
 from tqdm import tqdm
 
 from deepeval.dataset import Golden
@@ -49,7 +49,7 @@ class MMLU(DeepEvalBaseBenchmark):
         self,
         model: DeepEvalBaseLLM,
         *args,
-        batch_size: int
+        batch_size: Union[int, None] = None,
         **kwargs,
     ) -> DeepEvalBaseBenchmarkResult:
         import pandas as pd
deepeval/benchmarks/truthful_qa/truthful_qa.py
CHANGED
@@ -1,4 +1,4 @@
-from typing import List, Dict, Optional
+from typing import List, Dict, Optional, Union
 from tqdm import tqdm
 
 from deepeval.dataset import Golden
@@ -59,7 +59,7 @@ class TruthfulQA(DeepEvalBaseBenchmark):
         self,
         model: DeepEvalBaseLLM,
         *args,
-        batch_size: int
+        batch_size: Union[int, None] = None,
         **kwargs,
     ) -> DeepEvalBaseBenchmarkResult:
         import pandas as pd
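Across all six benchmark classes the change is the same: `batch_size` on the evaluation entry point is now typed `Union[int, None]` and defaults to `None` instead of being a bare required `int`. A minimal sketch of what this permits, assuming the shown signature belongs to each benchmark's `evaluate()` method and using a hypothetical `EchoLLM` stub for the required `DeepEvalBaseLLM`:

```python
from deepeval.benchmarks import MMLU
from deepeval.models import DeepEvalBaseLLM


class EchoLLM(DeepEvalBaseLLM):
    """Hypothetical stand-in model, only here to make the call signature concrete."""

    def load_model(self):
        return None

    def generate(self, prompt: str) -> str:
        return "A"

    async def a_generate(self, prompt: str) -> str:
        return "A"

    def get_model_name(self) -> str:
        return "echo-llm"


benchmark = MMLU()
# batch_size may now be omitted entirely...
results = benchmark.evaluate(model=EchoLLM())
# ...or still passed explicitly to opt into batched generation.
results = benchmark.evaluate(model=EchoLLM(), batch_size=8)
```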
deepeval/confident/api.py
CHANGED
@@ -10,6 +10,7 @@ from tenacity import (
     retry_if_exception_type,
     RetryCallState,
 )
+from pydantic import SecretStr
 
 import deepeval
 from deepeval.key_handler import KEY_FILE_HANDLER, KeyValues
@@ -88,7 +89,9 @@ class Endpoints(Enum):
     TEST_RUN_ENDPOINT = "/v1/test-run"
     TRACES_ENDPOINT = "/v1/traces"
     ANNOTATIONS_ENDPOINT = "/v1/annotations"
+    PROMPTS_VERSION_ID_ENDPOINT = "/v1/prompts/:alias/versions/:versionId"
     PROMPTS_ENDPOINT = "/v1/prompts"
+    PROMPTS_VERSIONS_ENDPOINT = "/v1/prompts/:alias/versions"
     SIMULATE_ENDPOINT = "/v1/simulate"
     EVALUATE_ENDPOINT = "/v1/evaluate"
 
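The two new `Endpoints` members use `:alias` and `:versionId` placeholders in their route strings. A small illustration of how such a template maps to a concrete URL path; the substitution helper below is hypothetical and not deepeval's actual client code:

```python
def expand_endpoint(template: str, **params: str) -> str:
    # Replace each ":name" placeholder with its supplied value.
    for key, value in params.items():
        template = template.replace(f":{key}", value)
    return template


path = expand_endpoint(
    "/v1/prompts/:alias/versions/:versionId",
    alias="my-prompt",
    versionId="12345",  # placeholder id
)
print(path)  # /v1/prompts/my-prompt/versions/12345
```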
deepeval/integrations/langchain/callback.py
CHANGED
@@ -9,6 +9,7 @@ from deepeval.tracing.types import (
 from deepeval.metrics import BaseMetric, TaskCompletionMetric
 from deepeval.test_case import LLMTestCase
 from deepeval.test_run import global_test_run_manager
+import uuid
 
 try:
     from langchain_core.callbacks.base import BaseCallbackHandler
@@ -81,6 +82,26 @@ class CallbackHandler(BaseCallbackHandler):
         )
         super().__init__()
 
+    def on_llm_new_token(
+        self,
+        token: str,
+        *,
+        chunk,
+        run_id: UUID,
+        parent_run_id: Optional[UUID] = None,
+        tags: Optional[list[str]] = None,
+        **kwargs: Any,
+    ):
+        llm_span: Optional[LlmSpan] = trace_manager.get_span_by_uuid(
+            str(run_id)
+        )
+        if llm_span is None:
+            return
+        if llm_span.token_intervals is None:
+            llm_span.token_intervals = {perf_counter(): token}
+        else:
+            llm_span.token_intervals[perf_counter()] = token
+
     def check_active_trace_id(self):
         if self.active_trace_id is None:
             self.active_trace_id = trace_manager.start_new_trace().uuid
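The new `on_llm_new_token` hook records each streamed token against a `perf_counter()` timestamp on the active `LlmSpan`. A hedged sketch of the kind of latency figure that dictionary makes available downstream; the dict literal below is fabricated sample data, not output from the handler:

```python
# token_intervals maps perf_counter() timestamps to streamed tokens; the values
# below stand in for what the handler would record during streaming.
token_intervals = {0.000: "Hel", 0.042: "lo", 0.081: " world"}

timestamps = sorted(token_intervals)
gaps = [b - a for a, b in zip(timestamps, timestamps[1:])]
if gaps:
    # Mean gap between consecutive tokens, roughly 0.04s for this sample.
    print(f"avg inter-token gap: {sum(gaps) / len(gaps):.3f}s")
```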
deepeval/integrations/pydantic_ai/{setup.py → otel.py}
RENAMED
@@ -31,14 +31,6 @@ def instrument_pydantic_ai(api_key: Optional[str] = None):
     with capture_tracing_integration("pydantic_ai"):
         is_opentelemetry_available()
 
-        if api_key:
-            deepeval.login(api_key)
-
-        api_key = get_confident_api_key()
-
-        if not api_key:
-            raise ValueError("No api key provided.")
-
         # create a new tracer provider
         tracer_provider = TracerProvider()
         tracer_provider.add_span_processor(
deepeval/integrations/pydantic_ai/patcher.py
ADDED
@@ -0,0 +1,376 @@
+import functools
+import deepeval
+from deepeval.tracing.types import LlmOutput, LlmToolCall
+from pydantic_ai.agent import AgentRunResult
+from deepeval.tracing.context import current_trace_context
+from deepeval.tracing.types import AgentSpan, LlmSpan
+from deepeval.tracing.tracing import Observer
+from typing import List, Callable, Optional, Any
+from deepeval.test_case.llm_test_case import ToolCall
+from deepeval.metrics.base_metric import BaseMetric
+from deepeval.confident.api import get_confident_api_key
+from deepeval.integrations.pydantic_ai.otel import instrument_pydantic_ai
+from deepeval.telemetry import capture_tracing_integration
+from deepeval.prompt import Prompt
+
+try:
+    from pydantic_ai.agent import Agent
+    from pydantic_ai.models import Model
+    from pydantic_ai.messages import (
+        ModelResponse,
+        ModelRequest,
+        ModelResponsePart,
+        TextPart,
+        ToolCallPart,
+        SystemPromptPart,
+        ToolReturnPart,
+        UserPromptPart,
+    )
+
+    pydantic_ai_installed = True
+except:
+    pydantic_ai_installed = True
+
+
+def _patch_agent_tool_decorator():
+    original_tool = Agent.tool
+
+    @functools.wraps(original_tool)
+    def wrapper(
+        *args,
+        metrics: Optional[List[BaseMetric]] = None,
+        metric_collection: Optional[str] = None,
+        **kwargs
+    ):
+        # Case 1: Direct decoration - @agent.tool
+        if args and callable(args[0]):
+            patched_func = _create_patched_tool(
+                args[0], metrics, metric_collection
+            )
+            new_args = (patched_func,) + args[1:]
+            return original_tool(*new_args, **kwargs)
+
+        # Case 2: Decoration with arguments - @agent.tool(metrics=..., metric_collection=...)
+        else:
+            # Return a decorator function that will receive the actual function
+            def decorator(func):
+                patched_func = _create_patched_tool(
+                    func, metrics, metric_collection
+                )
+                return original_tool(*args, **kwargs)(patched_func)
+
+            return decorator
+
+    Agent.tool = wrapper
+
+
+def _create_patched_tool(
+    func: Callable,
+    metrics: Optional[List[BaseMetric]] = None,
+    metric_collection: Optional[str] = None,
+):
+    import asyncio
+
+    original_func = func
+
+    is_async = asyncio.iscoroutinefunction(original_func)
+
+    if is_async:
+
+        @functools.wraps(original_func)
+        async def async_wrapper(*args, **kwargs):
+            with Observer(
+                span_type="tool",
+                func_name=original_func.__name__,
+                metrics=metrics,
+                metric_collection=metric_collection,
+                function_kwargs={"args": args, **kwargs},
+            ) as observer:
+                result = await original_func(*args, **kwargs)
+                observer.result = result
+
+            return result
+
+        return async_wrapper
+    else:
+
+        @functools.wraps(original_func)
+        def sync_wrapper(*args, **kwargs):
+            with Observer(
+                span_type="tool",
+                func_name=original_func.__name__,
+                metrics=metrics,
+                metric_collection=metric_collection,
+                function_kwargs={"args": args, **kwargs},
+            ) as observer:
+                result = original_func(*args, **kwargs)
+                observer.result = result
+
+            return result
+
+        return sync_wrapper
+
+
+def _patch_agent_init():
+    original_init = Agent.__init__
+
+    @functools.wraps(original_init)
+    def wrapper(
+        self,
+        *args,
+        llm_metric_collection: Optional[str] = None,
+        llm_metrics: Optional[List[BaseMetric]] = None,
+        llm_prompt: Optional[Prompt] = None,
+        agent_metric_collection: Optional[str] = None,
+        agent_metrics: Optional[List[BaseMetric]] = None,
+        **kwargs
+    ):
+        result = original_init(self, *args, **kwargs)
+        _patch_llm_model(
+            self._model, llm_metric_collection, llm_metrics, llm_prompt
+        )  # runtime patch of the model
+        _patch_agent_run(agent_metric_collection, agent_metrics)
+        return result
+
+    Agent.__init__ = wrapper
+
+
+def _patch_agent_run(
+    agent_metric_collection: Optional[str] = None,
+    agent_metrics: Optional[List[BaseMetric]] = None,
+):
+    original_run = Agent.run
+
+    @functools.wraps(original_run)
+    async def wrapper(
+        *args,
+        trace_metric_collection: Optional[str] = None,
+        trace_metrics: Optional[List[BaseMetric]] = None,
+        trace_name: Optional[str] = None,
+        trace_tags: Optional[List[str]] = None,
+        trace_metadata: Optional[dict] = None,
+        trace_thread_id: Optional[str] = None,
+        trace_user_id: Optional[str] = None,
+        **kwargs
+    ):
+        with Observer(
+            span_type="agent",
+            func_name="Agent",
+            function_kwargs={"input": args[1]},
+            metrics=agent_metrics,
+            metric_collection=agent_metric_collection,
+        ) as observer:
+            result = await original_run(*args, **kwargs)
+            observer.update_span_properties = (
+                lambda agent_span: set_agent_span_attributes(agent_span, result)
+            )
+            observer.result = result.output
+
+            _update_trace_context(
+                trace_name=trace_name,
+                trace_tags=trace_tags,
+                trace_metadata=trace_metadata,
+                trace_thread_id=trace_thread_id,
+                trace_user_id=trace_user_id,
+                trace_metric_collection=trace_metric_collection,
+                trace_metrics=trace_metrics,
+                trace_input=args[1],
+                trace_output=result.output,
+            )
+
+        return result
+
+    Agent.run = wrapper
+
+
+def _update_trace_context(
+    trace_name: Optional[str] = None,
+    trace_tags: Optional[List[str]] = None,
+    trace_metadata: Optional[dict] = None,
+    trace_thread_id: Optional[str] = None,
+    trace_user_id: Optional[str] = None,
+    trace_metric_collection: Optional[str] = None,
+    trace_metrics: Optional[List[BaseMetric]] = None,
+    trace_input: Optional[Any] = None,
+    trace_output: Optional[Any] = None,
+):
+
+    current_trace = current_trace_context.get()
+    current_trace.name = trace_name
+    current_trace.tags = trace_tags
+    current_trace.metadata = trace_metadata
+    current_trace.thread_id = trace_thread_id
+    current_trace.user_id = trace_user_id
+    current_trace.metric_collection = trace_metric_collection
+    current_trace.metrics = trace_metrics
+    current_trace.input = trace_input
+    current_trace.output = trace_output
+
+
+def _patch_llm_model(
+    model: Model,
+    llm_metric_collection: Optional[str] = None,
+    llm_metrics: Optional[List[BaseMetric]] = None,
+    llm_prompt: Optional[Prompt] = None,
+):
+    original_func = model.request
+    try:
+        model_name = model.model_name
+    except Exception:
+        model_name = "unknown"
+
+    @functools.wraps(original_func)
+    async def wrapper(*args, **kwargs):
+        with Observer(
+            span_type="llm",
+            func_name="LLM",
+            observe_kwargs={"model": model_name},
+            metrics=llm_metrics,
+            metric_collection=llm_metric_collection,
+        ) as observer:
+            result = await original_func(*args, **kwargs)
+            request = kwargs.get("messages", [])
+            if not request:
+                request = args[0]
+            observer.update_span_properties = (
+                lambda llm_span: set_llm_span_attributes(
+                    llm_span, args[0], result, llm_prompt
+                )
+            )
+            observer.result = result
+        return result
+
+    model.request = wrapper
+
+
+def instrument(otel: Optional[bool] = False, api_key: Optional[str] = None):
+
+    if api_key:
+        deepeval.login(api_key)
+
+    api_key = get_confident_api_key()
+
+    if not api_key:
+        raise ValueError("No api key provided.")
+
+    if otel:
+        instrument_pydantic_ai(api_key)
+    else:
+        with capture_tracing_integration("pydantic_ai"):
+            _patch_agent_init()
+            _patch_agent_tool_decorator()
+
+
+def set_llm_span_attributes(
+    llm_span: LlmSpan,
+    requests: List[ModelRequest],
+    result: ModelResponse,
+    llm_prompt: Optional[Prompt] = None,
+):
+    llm_span.prompt = llm_prompt
+
+    input = []
+    for request in requests:
+        for part in request.parts:
+            if isinstance(part, SystemPromptPart):
+                input.append({"role": "System", "content": part.content})
+            elif isinstance(part, UserPromptPart):
+                input.append({"role": "User", "content": part.content})
+            elif isinstance(part, ToolCallPart):
+                input.append(
+                    {
+                        "role": "Tool Call",
+                        "name": part.tool_name,
+                        "content": part.args_as_json_str(),
+                    }
+                )
+            elif isinstance(part, ToolReturnPart):
+                input.append(
+                    {
+                        "role": "Tool Return",
+                        "name": part.tool_name,
+                        "content": part.model_response_str(),
+                    }
+                )
+    llm_span.input = input
+
+    content = ""
+    tool_calls = []
+    for part in result.parts:
+        if isinstance(part, TextPart):
+            content += part.content + "\n"
+        elif isinstance(part, ToolCallPart):
+            tool_calls.append(
+                LlmToolCall(name=part.tool_name, args=part.args_as_dict())
+            )
+    llm_span.output = LlmOutput(
+        role="Assistant", content=content, tool_calls=tool_calls
+    )
+    llm_span.tools_called = _extract_tools_called_from_llm_response(
+        result.parts
+    )
+
+
+def set_agent_span_attributes(agent_span: AgentSpan, result: AgentRunResult):
+    agent_span.tools_called = _extract_tools_called(result)
+
+
+# llm tools called
+def _extract_tools_called_from_llm_response(
+    result: List[ModelResponsePart],
+) -> List[ToolCall]:
+    tool_calls = []
+
+    # Loop through each ModelResponsePart
+    for part in result:
+        # Look for parts with part_kind="tool-call"
+        if hasattr(part, "part_kind") and part.part_kind == "tool-call":
+            # Extract tool name and args from the ToolCallPart
+            tool_name = part.tool_name
+            input_parameters = (
+                part.args_as_dict() if hasattr(part, "args_as_dict") else None
+            )
+
+            # Create and append ToolCall object
+            tool_call = ToolCall(
+                name=tool_name, input_parameters=input_parameters
+            )
+            tool_calls.append(tool_call)
+
+    return tool_calls
+
+
+# TODO: llm tools called (reposne is present next message)
+def _extract_tools_called(result: AgentRunResult) -> List[ToolCall]:
+    tool_calls = []
+
+    # Access the message history from the _state
+    message_history = result._state.message_history
+
+    # Scan through all messages in the history
+    for message in message_history:
+        # Check if this is a ModelResponse (kind="response")
+        if hasattr(message, "kind") and message.kind == "response":
+            # For ModelResponse messages, check each part
+            if hasattr(message, "parts"):
+                for part in message.parts:
+                    # Look for parts with part_kind="tool-call"
+                    if (
+                        hasattr(part, "part_kind")
+                        and part.part_kind == "tool-call"
+                    ):
+                        # Extract tool name and args from the ToolCallPart
+                        tool_name = part.tool_name
+                        input_parameters = (
+                            part.args_as_dict()
+                            if hasattr(part, "args_as_dict")
+                            else None
+                        )
+
+                        # Create and append ToolCall object
+                        tool_call = ToolCall(
+                            name=tool_name, input_parameters=input_parameters
+                        )
+                        tool_calls.append(tool_call)
+
+    return tool_calls
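Taken together, `instrument()` is the new entry point: it handles login and API-key resolution (moved out of `instrument_pydantic_ai`) and then either installs the runtime patches above or defers to the OpenTelemetry path. A hedged usage sketch, assuming `instrument` is re-exported from `deepeval.integrations.pydantic_ai` (its `__init__.py` also changed in this release) and that a Confident API key plus model credentials are available in the environment:

```python
import asyncio
import os

from pydantic_ai import Agent
from deepeval.integrations.pydantic_ai import instrument  # assumed re-export

# otel=False (the default) applies the Agent/Model patches from patcher.py;
# otel=True routes through the OpenTelemetry-based instrument_pydantic_ai instead.
instrument(api_key=os.environ["CONFIDENT_API_KEY"])  # env var name is an assumption

agent = Agent("openai:gpt-4o-mini", system_prompt="Be concise.")


async def main() -> None:
    result = await agent.run(
        "What does DeepEval do?",
        trace_name="pydantic-ai-demo",  # extra keywords consumed by the patched Agent.run
        trace_thread_id="thread-1",
    )
    print(result.output)


asyncio.run(main())
```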
deepeval/metrics/answer_relevancy/template.py
CHANGED
@@ -34,62 +34,37 @@ JSON:
     @staticmethod
     def generate_verdicts(input: str, statements: str):
         return f"""For the provided list of statements, determine whether each statement is relevant to address the input.
-
-The 'verdict'
-
-
-The provided statements are statements made in the actual output.
+Generate JSON objects with 'verdict' and 'reason' fields.
+The 'verdict' should be 'yes' (relevant), 'no' (irrelevant), or 'idk' (ambiguous/supporting information).
+Provide 'reason' ONLY for 'no' or 'idk' verdicts.
+The statements are from an AI's actual output.
 
 **
 IMPORTANT: Please make sure to only return in valid and parseable JSON format, with the 'verdicts' key mapping to a list of JSON objects. Ensure all strings are closed appropriately. Repair any invalid JSON before you output it.
-Example input:
-What features does the new laptop have?
 
-
-Example statements:
-[
-    "The new laptop model has a high-resolution Retina display.",
-    "It includes a fast-charging battery with up to 12 hours of usage.",
-    "Security features include fingerprint authentication and an encrypted SSD.",
-    "Every purchase comes with a one-year warranty.",
-    "24/7 customer support is included.",
-    "Pineapples taste great on pizza.",
-    "The laptop is a Dell XPS 13."
-]
-
-Example JSON:
+Expected JSON format:
 {{
     "verdicts": [
         {{
            "verdict": "yes"
         }},
-        {{
-            "verdict": "yes"
-        }},
-        {{
-            "verdict": "yes"
-        }},
-        {{
-            "verdict": "no",
-            "reason": "A one-year warranty is a purchase benefit, not a feature of the laptop itself."
-        }},
         {{
             "verdict": "no",
-            "reason":
-        }},
-        {{
-            "verdict": "no",
-            "reason": "The statement about pineapples on pizza is completely irrelevant to the input, which asks about laptop features."
+            "reason": <explanation_for_irrelevance>
         }},
         {{
             "verdict": "idk",
-            "reason":
+            "reason": <explanation_for_ambiguity>
         }}
     ]
 }}
-===== END OF EXAMPLE ======
 
-
+Generate ONE verdict per statement - number of 'verdicts' MUST equal number of statements.
+'verdict' must be STRICTLY 'yes', 'no', or 'idk':
+- 'yes': statement is relevant to addressing the input
+- 'no': statement is irrelevant to the input
+- 'idk': statement is ambiguous (not directly relevant but could be supporting information)
+Provide 'reason' ONLY for 'no' or 'idk' verdicts.
 **
 
 Input: