deepeval-3.4.8-py3-none-any.whl → deepeval-3.5.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/__init__.py +8 -5
- deepeval/_version.py +1 -1
- deepeval/benchmarks/drop/drop.py +2 -3
- deepeval/benchmarks/hellaswag/hellaswag.py +2 -2
- deepeval/benchmarks/logi_qa/logi_qa.py +2 -2
- deepeval/benchmarks/math_qa/math_qa.py +2 -2
- deepeval/benchmarks/mmlu/mmlu.py +2 -2
- deepeval/benchmarks/truthful_qa/truthful_qa.py +2 -2
- deepeval/cli/main.py +561 -727
- deepeval/confident/api.py +30 -14
- deepeval/config/__init__.py +0 -0
- deepeval/config/settings.py +565 -0
- deepeval/config/settings_manager.py +133 -0
- deepeval/config/utils.py +86 -0
- deepeval/dataset/__init__.py +1 -0
- deepeval/dataset/dataset.py +70 -10
- deepeval/dataset/test_run_tracer.py +82 -0
- deepeval/dataset/utils.py +23 -0
- deepeval/integrations/pydantic_ai/__init__.py +2 -4
- deepeval/integrations/pydantic_ai/{setup.py → otel.py} +0 -8
- deepeval/integrations/pydantic_ai/patcher.py +376 -0
- deepeval/key_handler.py +1 -0
- deepeval/metrics/answer_relevancy/template.py +7 -2
- deepeval/metrics/faithfulness/template.py +11 -8
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +6 -4
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +6 -4
- deepeval/metrics/tool_correctness/tool_correctness.py +7 -3
- deepeval/models/llms/amazon_bedrock_model.py +24 -3
- deepeval/models/llms/grok_model.py +1 -1
- deepeval/models/llms/kimi_model.py +1 -1
- deepeval/models/llms/openai_model.py +37 -41
- deepeval/models/retry_policy.py +280 -0
- deepeval/openai_agents/agent.py +4 -2
- deepeval/test_run/api.py +1 -0
- deepeval/tracing/otel/exporter.py +20 -8
- deepeval/tracing/otel/utils.py +57 -0
- deepeval/tracing/perf_epoch_bridge.py +4 -4
- deepeval/tracing/tracing.py +37 -16
- deepeval/tracing/utils.py +98 -1
- deepeval/utils.py +111 -70
- {deepeval-3.4.8.dist-info → deepeval-3.5.0.dist-info}/METADATA +16 -13
- {deepeval-3.4.8.dist-info → deepeval-3.5.0.dist-info}/RECORD +45 -40
- deepeval/env.py +0 -35
- deepeval/integrations/pydantic_ai/agent.py +0 -364
- {deepeval-3.4.8.dist-info → deepeval-3.5.0.dist-info}/LICENSE.md +0 -0
- {deepeval-3.4.8.dist-info → deepeval-3.5.0.dist-info}/WHEEL +0 -0
- {deepeval-3.4.8.dist-info → deepeval-3.5.0.dist-info}/entry_points.txt +0 -0
deepeval/integrations/pydantic_ai/patcher.py
ADDED
@@ -0,0 +1,376 @@
+import functools
+import deepeval
+from deepeval.tracing.types import LlmOutput, LlmToolCall
+from pydantic_ai.agent import AgentRunResult
+from deepeval.tracing.context import current_trace_context
+from deepeval.tracing.types import AgentSpan, LlmSpan
+from deepeval.tracing.tracing import Observer
+from typing import List, Callable, Optional, Any
+from deepeval.test_case.llm_test_case import ToolCall
+from deepeval.metrics.base_metric import BaseMetric
+from deepeval.confident.api import get_confident_api_key
+from deepeval.integrations.pydantic_ai.otel import instrument_pydantic_ai
+from deepeval.telemetry import capture_tracing_integration
+from deepeval.prompt import Prompt
+
+try:
+    from pydantic_ai.agent import Agent
+    from pydantic_ai.models import Model
+    from pydantic_ai.messages import (
+        ModelResponse,
+        ModelRequest,
+        ModelResponsePart,
+        TextPart,
+        ToolCallPart,
+        SystemPromptPart,
+        ToolReturnPart,
+        UserPromptPart,
+    )
+
+    pydantic_ai_installed = True
+except:
+    pydantic_ai_installed = True
+
+
+def _patch_agent_tool_decorator():
+    original_tool = Agent.tool
+
+    @functools.wraps(original_tool)
+    def wrapper(
+        *args,
+        metrics: Optional[List[BaseMetric]] = None,
+        metric_collection: Optional[str] = None,
+        **kwargs
+    ):
+        # Case 1: Direct decoration - @agent.tool
+        if args and callable(args[0]):
+            patched_func = _create_patched_tool(
+                args[0], metrics, metric_collection
+            )
+            new_args = (patched_func,) + args[1:]
+            return original_tool(*new_args, **kwargs)
+
+        # Case 2: Decoration with arguments - @agent.tool(metrics=..., metric_collection=...)
+        else:
+            # Return a decorator function that will receive the actual function
+            def decorator(func):
+                patched_func = _create_patched_tool(
+                    func, metrics, metric_collection
+                )
+                return original_tool(*args, **kwargs)(patched_func)
+
+            return decorator
+
+    Agent.tool = wrapper
+
+
+def _create_patched_tool(
+    func: Callable,
+    metrics: Optional[List[BaseMetric]] = None,
+    metric_collection: Optional[str] = None,
+):
+    import asyncio
+
+    original_func = func
+
+    is_async = asyncio.iscoroutinefunction(original_func)
+
+    if is_async:
+
+        @functools.wraps(original_func)
+        async def async_wrapper(*args, **kwargs):
+            with Observer(
+                span_type="tool",
+                func_name=original_func.__name__,
+                metrics=metrics,
+                metric_collection=metric_collection,
+                function_kwargs={"args": args, **kwargs},
+            ) as observer:
+                result = await original_func(*args, **kwargs)
+                observer.result = result
+
+            return result
+
+        return async_wrapper
+    else:
+
+        @functools.wraps(original_func)
+        def sync_wrapper(*args, **kwargs):
+            with Observer(
+                span_type="tool",
+                func_name=original_func.__name__,
+                metrics=metrics,
+                metric_collection=metric_collection,
+                function_kwargs={"args": args, **kwargs},
+            ) as observer:
+                result = original_func(*args, **kwargs)
+                observer.result = result
+
+            return result
+
+        return sync_wrapper
+
+
+def _patch_agent_init():
+    original_init = Agent.__init__
+
+    @functools.wraps(original_init)
+    def wrapper(
+        self,
+        *args,
+        llm_metric_collection: Optional[str] = None,
+        llm_metrics: Optional[List[BaseMetric]] = None,
+        llm_prompt: Optional[Prompt] = None,
+        agent_metric_collection: Optional[str] = None,
+        agent_metrics: Optional[List[BaseMetric]] = None,
+        **kwargs
+    ):
+        result = original_init(self, *args, **kwargs)
+        _patch_llm_model(
+            self._model, llm_metric_collection, llm_metrics, llm_prompt
+        )  # runtime patch of the model
+        _patch_agent_run(agent_metric_collection, agent_metrics)
+        return result
+
+    Agent.__init__ = wrapper
+
+
+def _patch_agent_run(
+    agent_metric_collection: Optional[str] = None,
+    agent_metrics: Optional[List[BaseMetric]] = None,
+):
+    original_run = Agent.run
+
+    @functools.wraps(original_run)
+    async def wrapper(
+        *args,
+        trace_metric_collection: Optional[str] = None,
+        trace_metrics: Optional[List[BaseMetric]] = None,
+        trace_name: Optional[str] = None,
+        trace_tags: Optional[List[str]] = None,
+        trace_metadata: Optional[dict] = None,
+        trace_thread_id: Optional[str] = None,
+        trace_user_id: Optional[str] = None,
+        **kwargs
+    ):
+        with Observer(
+            span_type="agent",
+            func_name="Agent",
+            function_kwargs={"input": args[1]},
+            metrics=agent_metrics,
+            metric_collection=agent_metric_collection,
+        ) as observer:
+            result = await original_run(*args, **kwargs)
+            observer.update_span_properties = (
+                lambda agent_span: set_agent_span_attributes(agent_span, result)
+            )
+            observer.result = result.output
+
+            _update_trace_context(
+                trace_name=trace_name,
+                trace_tags=trace_tags,
+                trace_metadata=trace_metadata,
+                trace_thread_id=trace_thread_id,
+                trace_user_id=trace_user_id,
+                trace_metric_collection=trace_metric_collection,
+                trace_metrics=trace_metrics,
+                trace_input=args[1],
+                trace_output=result.output,
+            )
+
+        return result
+
+    Agent.run = wrapper
+
+
+def _update_trace_context(
+    trace_name: Optional[str] = None,
+    trace_tags: Optional[List[str]] = None,
+    trace_metadata: Optional[dict] = None,
+    trace_thread_id: Optional[str] = None,
+    trace_user_id: Optional[str] = None,
+    trace_metric_collection: Optional[str] = None,
+    trace_metrics: Optional[List[BaseMetric]] = None,
+    trace_input: Optional[Any] = None,
+    trace_output: Optional[Any] = None,
+):
+
+    current_trace = current_trace_context.get()
+    current_trace.name = trace_name
+    current_trace.tags = trace_tags
+    current_trace.metadata = trace_metadata
+    current_trace.thread_id = trace_thread_id
+    current_trace.user_id = trace_user_id
+    current_trace.metric_collection = trace_metric_collection
+    current_trace.metrics = trace_metrics
+    current_trace.input = trace_input
+    current_trace.output = trace_output
+
+
+def _patch_llm_model(
+    model: Model,
+    llm_metric_collection: Optional[str] = None,
+    llm_metrics: Optional[List[BaseMetric]] = None,
+    llm_prompt: Optional[Prompt] = None,
+):
+    original_func = model.request
+    try:
+        model_name = model.model_name
+    except Exception:
+        model_name = "unknown"
+
+    @functools.wraps(original_func)
+    async def wrapper(*args, **kwargs):
+        with Observer(
+            span_type="llm",
+            func_name="LLM",
+            observe_kwargs={"model": model_name},
+            metrics=llm_metrics,
+            metric_collection=llm_metric_collection,
+        ) as observer:
+            result = await original_func(*args, **kwargs)
+            request = kwargs.get("messages", [])
+            if not request:
+                request = args[0]
+            observer.update_span_properties = (
+                lambda llm_span: set_llm_span_attributes(
+                    llm_span, args[0], result, llm_prompt
+                )
+            )
+            observer.result = result
+        return result
+
+    model.request = wrapper
+
+
+def instrument(otel: Optional[bool] = False, api_key: Optional[str] = None):
+
+    if api_key:
+        deepeval.login(api_key)
+
+    api_key = get_confident_api_key()
+
+    if not api_key:
+        raise ValueError("No api key provided.")
+
+    if otel:
+        instrument_pydantic_ai(api_key)
+    else:
+        with capture_tracing_integration("pydantic_ai"):
+            _patch_agent_init()
+            _patch_agent_tool_decorator()
+
+
+def set_llm_span_attributes(
+    llm_span: LlmSpan,
+    requests: List[ModelRequest],
+    result: ModelResponse,
+    llm_prompt: Optional[Prompt] = None,
+):
+    llm_span.prompt = llm_prompt
+
+    input = []
+    for request in requests:
+        for part in request.parts:
+            if isinstance(part, SystemPromptPart):
+                input.append({"role": "System", "content": part.content})
+            elif isinstance(part, UserPromptPart):
+                input.append({"role": "User", "content": part.content})
+            elif isinstance(part, ToolCallPart):
+                input.append(
+                    {
+                        "role": "Tool Call",
+                        "name": part.tool_name,
+                        "content": part.args_as_json_str(),
+                    }
+                )
+            elif isinstance(part, ToolReturnPart):
+                input.append(
+                    {
+                        "role": "Tool Return",
+                        "name": part.tool_name,
+                        "content": part.model_response_str(),
+                    }
+                )
+    llm_span.input = input
+
+    content = ""
+    tool_calls = []
+    for part in result.parts:
+        if isinstance(part, TextPart):
+            content += part.content + "\n"
+        elif isinstance(part, ToolCallPart):
+            tool_calls.append(
+                LlmToolCall(name=part.tool_name, args=part.args_as_dict())
+            )
+    llm_span.output = LlmOutput(
+        role="Assistant", content=content, tool_calls=tool_calls
+    )
+    llm_span.tools_called = _extract_tools_called_from_llm_response(
+        result.parts
+    )
+
+
+def set_agent_span_attributes(agent_span: AgentSpan, result: AgentRunResult):
+    agent_span.tools_called = _extract_tools_called(result)
+
+
+# llm tools called
+def _extract_tools_called_from_llm_response(
+    result: List[ModelResponsePart],
+) -> List[ToolCall]:
+    tool_calls = []
+
+    # Loop through each ModelResponsePart
+    for part in result:
+        # Look for parts with part_kind="tool-call"
+        if hasattr(part, "part_kind") and part.part_kind == "tool-call":
+            # Extract tool name and args from the ToolCallPart
+            tool_name = part.tool_name
+            input_parameters = (
+                part.args_as_dict() if hasattr(part, "args_as_dict") else None
+            )
+
+            # Create and append ToolCall object
+            tool_call = ToolCall(
+                name=tool_name, input_parameters=input_parameters
+            )
+            tool_calls.append(tool_call)
+
+    return tool_calls
+
+
+# TODO: llm tools called (reposne is present next message)
+def _extract_tools_called(result: AgentRunResult) -> List[ToolCall]:
+    tool_calls = []
+
+    # Access the message history from the _state
+    message_history = result._state.message_history
+
+    # Scan through all messages in the history
+    for message in message_history:
+        # Check if this is a ModelResponse (kind="response")
+        if hasattr(message, "kind") and message.kind == "response":
+            # For ModelResponse messages, check each part
+            if hasattr(message, "parts"):
+                for part in message.parts:
+                    # Look for parts with part_kind="tool-call"
+                    if (
+                        hasattr(part, "part_kind")
+                        and part.part_kind == "tool-call"
+                    ):
+                        # Extract tool name and args from the ToolCallPart
+                        tool_name = part.tool_name
+                        input_parameters = (
+                            part.args_as_dict()
+                            if hasattr(part, "args_as_dict")
+                            else None
+                        )
+
+                        # Create and append ToolCall object
+                        tool_call = ToolCall(
+                            name=tool_name, input_parameters=input_parameters
+                        )
+                        tool_calls.append(tool_call)
+
+    return tool_calls
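
Not part of the diff: a minimal usage sketch of the new runtime patcher. It assumes `instrument` is re-exported from `deepeval.integrations.pydantic_ai` (the `__init__.py` change listed above suggests this); the model name, metric, tool body, and API key are illustrative.

import asyncio

from pydantic_ai import Agent, RunContext

from deepeval.integrations.pydantic_ai import instrument
from deepeval.metrics import AnswerRelevancyMetric

# otel=False path: monkey-patches Agent.__init__, Agent.tool and Agent.run.
instrument(api_key="<your-confident-api-key>")

agent = Agent(
    "openai:gpt-4o-mini",
    llm_metrics=[AnswerRelevancyMetric()],  # forwarded to the patched LLM span
)

@agent.tool(metric_collection="tools")  # "Case 2" decoration with arguments
async def get_weather(ctx: RunContext[None], city: str) -> str:
    return f"Sunny in {city}"

async def main():
    # The extra trace_* kwargs are consumed by the patched Agent.run wrapper.
    result = await agent.run(
        "What's the weather in Zurich?",
        trace_name="weather-demo",
        trace_thread_id="thread-1",
    )
    print(result.output)

asyncio.run(main())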
deepeval/key_handler.py
CHANGED
@@ -80,6 +80,7 @@ class ModelKeyValues(Enum):
     OPENAI_MODEL_NAME = "OPENAI_MODEL_NAME"
     OPENAI_COST_PER_INPUT_TOKEN = "OPENAI_COST_PER_INPUT_TOKEN"
     OPENAI_COST_PER_OUTPUT_TOKEN = "OPENAI_COST_PER_OUTPUT_TOKEN"
+    OPENAI_API_KEY = "OPENAI_API_KEY"
     # Moonshot
     USE_MOONSHOT_MODEL = "USE_MOONSHOT_MODEL"
     MOONSHOT_MODEL_NAME = "MOONSHOT_MODEL_NAME"
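
Illustrative only (not from the diff): the new enum member's value matches the conventional environment variable name, so the same string can be used as a lookup key.

import os

from deepeval.key_handler import ModelKeyValues

# ModelKeyValues.OPENAI_API_KEY.value == "OPENAI_API_KEY"
api_key = os.getenv(ModelKeyValues.OPENAI_API_KEY.value)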
deepeval/metrics/answer_relevancy/template.py
CHANGED
@@ -37,7 +37,7 @@ JSON:
 Please generate a list of JSON with two keys: `verdict` and `reason`.
 The 'verdict' key should STRICTLY be either a 'yes', 'idk' or 'no'. Answer 'yes' if the statement is relevant to addressing the original input, 'no' if the statement is irrelevant, and 'idk' if it is ambiguous (eg., not directly relevant but could be used as a supporting point to address the input).
 The 'reason' is the reason for the verdict.
-Provide a 'reason' ONLY if the answer is 'no'.
+Provide a 'reason' ONLY if the answer is 'no' or 'idk'.
 The provided statements are statements made in the actual output.
 
 **
@@ -53,7 +53,8 @@ Example statements:
 "Security features include fingerprint authentication and an encrypted SSD.",
 "Every purchase comes with a one-year warranty.",
 "24/7 customer support is included.",
-"Pineapples taste great on pizza."
+"Pineapples taste great on pizza.",
+"The laptop is a Dell XPS 13."
 ]
 
 Example JSON:
@@ -79,6 +80,10 @@ Example JSON:
 {{
 "verdict": "no",
 "reason": "The statement about pineapples on pizza is completely irrelevant to the input, which asks about laptop features."
+}},
+{{
+"verdict": "idk",
+"reason": "The statement about the laptop being a Dell XPS 13 is not directly relevant to the input, but could be used as a supporting point to address the input."
 }}
 ]
 }}
deepeval/metrics/faithfulness/template.py
CHANGED
@@ -4,7 +4,7 @@ from typing import Optional, List
 class FaithfulnessTemplate:
     @staticmethod
     def generate_claims(actual_output: str):
-        return f"""Based on the given text, please extract a comprehensive list of FACTUAL, undisputed truths, that can inferred from the provided
+        return f"""Based on the given text, please extract a comprehensive list of FACTUAL, undisputed truths, that can inferred from the provided actual AI output.
 These truths, MUST BE COHERENT, and CANNOT be taken out of context.
 
 Example:
@@ -24,9 +24,10 @@ Example JSON:
 IMPORTANT: Please make sure to only return in JSON format, with the "claims" key as a list of strings. No words or explanation is needed.
 Only include claims that are factual, BUT IT DOESN'T MATTER IF THEY ARE FACTUALLY CORRECT. The claims you extract should include the full context it was presented in, NOT cherry picked facts.
 You should NOT include any prior knowledge, and take the text at face value when extracting claims.
+You should be aware that it is an AI that is outputting these claims.
 **
 
-
+AI Output:
 {actual_output}
 
 JSON:
@@ -72,7 +73,7 @@ JSON:
     def generate_verdicts(claims: List[str], retrieval_context: str):
         return f"""Based on the given claims, which is a list of strings, generate a list of JSON objects to indicate whether EACH claim contradicts any facts in the retrieval context. The JSON will have 2 fields: 'verdict' and 'reason'.
 The 'verdict' key should STRICTLY be either 'yes', 'no', or 'idk', which states whether the given claim agrees with the context.
-Provide a 'reason' ONLY if the answer is 'no'.
+Provide a 'reason' ONLY if the answer is 'no' or 'idk'.
 The provided claim is drawn from the actual output. Try to provide a correction in the reason using the facts in the retrieval context.
 
 **
@@ -84,28 +85,30 @@ Example:
 {{
 "verdicts": [
 {{
-"verdict": "idk"
+"verdict": "idk",
+"reason": "The claim about Barack Obama is although incorrect, it is not directly addressed in the retrieval context, and so poses no contradiction."
 }},
 {{
-"verdict": "idk"
+"verdict": "idk",
+"reason": "The claim about Zurich being a city in London is incorrect but does not pose a contradiction to the retrieval context."
 }},
 {{
 "verdict": "yes"
 }},
 {{
 "verdict": "no",
-"reason": "The actual output claims Einstein won the Nobel Prize in 1969, which is untrue as the retrieval context states it is 1968 instead."
+"reason": "The actual output claims Einstein won the Nobel Prize in 1969, which is untrue as the retrieval context states it is 1968 instead. This contradicts the retrieval context."
 }},
 {{
 "verdict": "no",
-"reason": "The actual output claims Einstein is a German chef, which is not correct as the retrieval context states he was a German scientist instead."
+"reason": "The actual output claims Einstein is a German chef, which is not correct as the retrieval context states he was a German scientist instead. This contradicts the retrieval context."
 }},
 ]
 }}
 ===== END OF EXAMPLE ======
 
 The length of 'verdicts' SHOULD BE STRICTLY EQUAL to that of claims.
-You DON'T have to provide a reason if the answer is 'yes'
+You DON'T have to provide a reason if the answer is 'yes'.
 ONLY provide a 'no' answer if the retrieval context DIRECTLY CONTRADICTS the claims. YOU SHOULD NEVER USE YOUR PRIOR KNOWLEDGE IN YOUR JUDGEMENT.
 Claims made using vague, suggestive, speculative language such as 'may have', 'possibility due to', does NOT count as a contradiction.
 Claims that are not backed up by the retrieval context or are not mentioned in it MUST be answered 'idk'.
deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py
CHANGED
@@ -39,7 +39,7 @@ class MultimodalAnswerRelevancyTemplate:
 Please generate a list of JSON with two keys: `verdict` and `reason`.
 The 'verdict' key should STRICTLY be either a 'yes', 'idk' or 'no'. Answer 'yes' if the statement or image is relevant to addressing the original input, 'no' if the statement or image is irrelevant, and 'idk' if it is ambiguous (eg., not directly relevant but could be used as a supporting point to address the input).
 The 'reason' is the reason for the verdict.
-Provide a 'reason' ONLY if the answer is 'no'.
+Provide a 'reason' ONLY if the answer is 'no' or 'idk'.
 The provided statements are statements and images generated in the actual output.
 
 **
@@ -54,13 +54,15 @@ class MultimodalAnswerRelevancyTemplate:
 "reason": "The 'Shoes.' statement made in the actual output is completely irrelevant to the input, which asks about what to do in the event of an earthquake."
 }},
 {{
-"verdict": "idk"
+"verdict": "idk",
+"reason": "The statement thanking the user for asking the question is not directly relevant to the input, but is not entirely irrelevant."
 }},
 {{
-"verdict": "idk"
+"verdict": "idk",
+"reason": "The question about whether there is anything else the user can help with is not directly relevant to the input, but is not entirely irrelevant."
 }},
 {{
-"verdict": "yes"
+"verdict": "yes",
 }}
 ]
 }}
deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py
CHANGED
@@ -95,7 +95,7 @@ class MultimodalFaithfulnessTemplate:
         return textwrap.dedent(
             f"""Based on the given claims, which is a list of strings, generate a list of JSON objects to indicate whether EACH claim contradicts any facts in the retrieval context. The JSON will have 2 fields: 'verdict' and 'reason'.
 The 'verdict' key should STRICTLY be either 'yes', 'no', or 'idk', which states whether the given claim agrees with the context.
-Provide a 'reason' ONLY if the answer is 'no'.
+Provide a 'reason' ONLY if the answer is 'no' or 'idk'.
 The provided claim is drawn from the actual output. Try to provide a correction in the reason using the facts in the retrieval context.
 
 **
@@ -107,10 +107,12 @@ class MultimodalFaithfulnessTemplate:
 {{
 "verdicts": [
 {{
-"verdict": "idk"
+"verdict": "idk",
+"reason": "The claim about Barack Obama is not directly addressed in the retrieval context, and so poses no contradiction."
 }},
 {{
-"verdict": "idk"
+"verdict": "idk",
+"reason": "The claim about Zurich being a city in London is incorrect but does not pose a contradiction to the retrieval context."
 }},
 {{
 "verdict": "yes"
@@ -128,7 +130,7 @@ class MultimodalFaithfulnessTemplate:
 ===== END OF EXAMPLE ======
 
 The length of 'verdicts' SHOULD BE STRICTLY EQUAL to that of claims.
-You DON'T have to provide a reason if the answer is 'yes'
+You DON'T have to provide a reason if the answer is 'yes'.
 ONLY provide a 'no' answer if the retrieval context DIRECTLY CONTRADICTS the claims. YOU SHOULD NEVER USE YOUR PRIOR KNOWLEDGE IN YOUR JUDGEMENT.
 Claims made using vague, suggestive, speculative language such as 'may have', 'possibility due to', does NOT count as a contradiction.
 Claims that is not backed up due to a lack of information/is not mentioned in the retrieval contexts MUST be answered 'idk', otherwise I WILL DIE.
deepeval/metrics/tool_correctness/tool_correctness.py
CHANGED
@@ -223,9 +223,13 @@ class ToolCorrectnessMetric(BaseMetric):
             total_score += best_score
             matched_called_tools.add(best_called_tool)
         return (
-
-            if self.expected_tools
-            else
+            1.0
+            if not self.expected_tools and not self.tools_called
+            else (
+                0.0
+                if not self.expected_tools
+                else total_score / len(self.expected_tools)
+            )
         )
 
         # Consider ordering score
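
Not part of the diff: a standalone sketch of the scoring behavior the new return expression introduces, with illustrative tool names and match score.

def tool_correctness_score(expected_tools, tools_called, total_score):
    # Mirrors the new expression: perfect score when nothing was expected and
    # nothing was called, zero when tools were called despite none being
    # expected, otherwise the accumulated match score averaged over the
    # expected tools.
    return (
        1.0
        if not expected_tools and not tools_called
        else (
            0.0
            if not expected_tools
            else total_score / len(expected_tools)
        )
    )

print(tool_correctness_score([], [], 0.0))                                 # 1.0
print(tool_correctness_score([], ["web_search"], 0.0))                     # 0.0
print(tool_correctness_score(["web_search", "sum"], ["web_search"], 1.0))  # 0.5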
deepeval/models/llms/amazon_bedrock_model.py
CHANGED
@@ -115,13 +115,34 @@ class AmazonBedrockModel(DeepEvalBaseLLM):
     ###############################################
 
     def get_converse_request_body(self, prompt: str) -> dict:
+        # Inline parameter translation with defaults
+        param_mapping = {
+            "max_tokens": "maxTokens",
+            "top_p": "topP",
+            "top_k": "topK",
+            "stop_sequences": "stopSequences",
+        }
+
+        # Start with defaults for required parameters
+        translated_kwargs = {
+            "maxTokens": self.generation_kwargs.get("max_tokens", 1000),
+            "topP": self.generation_kwargs.get("top_p", 0),
+        }
+
+        # Add any other parameters from generation_kwargs
+        for key, value in self.generation_kwargs.items():
+            if key not in [
+                "max_tokens",
+                "top_p",
+            ]:  # Skip already handled defaults
+                aws_key = param_mapping.get(key, key)
+                translated_kwargs[aws_key] = value
+
         return {
             "messages": [{"role": "user", "content": [{"text": prompt}]}],
             "inferenceConfig": {
                 "temperature": self.temperature,
-
-                "maxTokens": self.generation_kwargs.get("max_tokens", 1000),
-                **self.generation_kwargs,
+                **translated_kwargs,
             },
         }
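
Not part of the diff: a standalone sketch of the parameter translation the new request body performs, with illustrative generation kwargs.

def translate_generation_kwargs(generation_kwargs: dict) -> dict:
    # snake_case kwargs are mapped to camelCase inferenceConfig keys instead of
    # being spread into the request body unchanged, as they were before 3.5.0.
    param_mapping = {
        "max_tokens": "maxTokens",
        "top_p": "topP",
        "top_k": "topK",
        "stop_sequences": "stopSequences",
    }
    translated = {
        "maxTokens": generation_kwargs.get("max_tokens", 1000),
        "topP": generation_kwargs.get("top_p", 0),
    }
    for key, value in generation_kwargs.items():
        if key not in ("max_tokens", "top_p"):
            translated[param_mapping.get(key, key)] = value
    return translated

print(translate_generation_kwargs({"max_tokens": 512, "top_k": 40}))
# {'maxTokens': 512, 'topP': 0, 'topK': 40}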
|