deepeval 3.4.8__py3-none-any.whl → 3.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. deepeval/__init__.py +8 -5
  2. deepeval/_version.py +1 -1
  3. deepeval/benchmarks/drop/drop.py +2 -3
  4. deepeval/benchmarks/hellaswag/hellaswag.py +2 -2
  5. deepeval/benchmarks/logi_qa/logi_qa.py +2 -2
  6. deepeval/benchmarks/math_qa/math_qa.py +2 -2
  7. deepeval/benchmarks/mmlu/mmlu.py +2 -2
  8. deepeval/benchmarks/truthful_qa/truthful_qa.py +2 -2
  9. deepeval/cli/main.py +561 -727
  10. deepeval/confident/api.py +30 -14
  11. deepeval/config/__init__.py +0 -0
  12. deepeval/config/settings.py +565 -0
  13. deepeval/config/settings_manager.py +133 -0
  14. deepeval/config/utils.py +86 -0
  15. deepeval/dataset/__init__.py +1 -0
  16. deepeval/dataset/dataset.py +70 -10
  17. deepeval/dataset/test_run_tracer.py +82 -0
  18. deepeval/dataset/utils.py +23 -0
  19. deepeval/integrations/pydantic_ai/__init__.py +2 -4
  20. deepeval/integrations/pydantic_ai/{setup.py → otel.py} +0 -8
  21. deepeval/integrations/pydantic_ai/patcher.py +376 -0
  22. deepeval/key_handler.py +1 -0
  23. deepeval/metrics/answer_relevancy/template.py +7 -2
  24. deepeval/metrics/faithfulness/template.py +11 -8
  25. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +6 -4
  26. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +6 -4
  27. deepeval/metrics/tool_correctness/tool_correctness.py +7 -3
  28. deepeval/models/llms/amazon_bedrock_model.py +24 -3
  29. deepeval/models/llms/grok_model.py +1 -1
  30. deepeval/models/llms/kimi_model.py +1 -1
  31. deepeval/models/llms/openai_model.py +37 -41
  32. deepeval/models/retry_policy.py +280 -0
  33. deepeval/openai_agents/agent.py +4 -2
  34. deepeval/test_run/api.py +1 -0
  35. deepeval/tracing/otel/exporter.py +20 -8
  36. deepeval/tracing/otel/utils.py +57 -0
  37. deepeval/tracing/perf_epoch_bridge.py +4 -4
  38. deepeval/tracing/tracing.py +37 -16
  39. deepeval/tracing/utils.py +98 -1
  40. deepeval/utils.py +111 -70
  41. {deepeval-3.4.8.dist-info → deepeval-3.5.0.dist-info}/METADATA +16 -13
  42. {deepeval-3.4.8.dist-info → deepeval-3.5.0.dist-info}/RECORD +45 -40
  43. deepeval/env.py +0 -35
  44. deepeval/integrations/pydantic_ai/agent.py +0 -364
  45. {deepeval-3.4.8.dist-info → deepeval-3.5.0.dist-info}/LICENSE.md +0 -0
  46. {deepeval-3.4.8.dist-info → deepeval-3.5.0.dist-info}/WHEEL +0 -0
  47. {deepeval-3.4.8.dist-info → deepeval-3.5.0.dist-info}/entry_points.txt +0 -0
deepeval/integrations/pydantic_ai/patcher.py ADDED
@@ -0,0 +1,376 @@
+ import functools
+ import deepeval
+ from deepeval.tracing.types import LlmOutput, LlmToolCall
+ from pydantic_ai.agent import AgentRunResult
+ from deepeval.tracing.context import current_trace_context
+ from deepeval.tracing.types import AgentSpan, LlmSpan
+ from deepeval.tracing.tracing import Observer
+ from typing import List, Callable, Optional, Any
+ from deepeval.test_case.llm_test_case import ToolCall
+ from deepeval.metrics.base_metric import BaseMetric
+ from deepeval.confident.api import get_confident_api_key
+ from deepeval.integrations.pydantic_ai.otel import instrument_pydantic_ai
+ from deepeval.telemetry import capture_tracing_integration
+ from deepeval.prompt import Prompt
+
+ try:
+     from pydantic_ai.agent import Agent
+     from pydantic_ai.models import Model
+     from pydantic_ai.messages import (
+         ModelResponse,
+         ModelRequest,
+         ModelResponsePart,
+         TextPart,
+         ToolCallPart,
+         SystemPromptPart,
+         ToolReturnPart,
+         UserPromptPart,
+     )
+
+     pydantic_ai_installed = True
+ except:
+     pydantic_ai_installed = False
+
+
+ def _patch_agent_tool_decorator():
+     original_tool = Agent.tool
+
+     @functools.wraps(original_tool)
+     def wrapper(
+         *args,
+         metrics: Optional[List[BaseMetric]] = None,
+         metric_collection: Optional[str] = None,
+         **kwargs
+     ):
+         # Case 1: Direct decoration - @agent.tool
+         if args and callable(args[0]):
+             patched_func = _create_patched_tool(
+                 args[0], metrics, metric_collection
+             )
+             new_args = (patched_func,) + args[1:]
+             return original_tool(*new_args, **kwargs)
+
+         # Case 2: Decoration with arguments - @agent.tool(metrics=..., metric_collection=...)
+         else:
+             # Return a decorator function that will receive the actual function
+             def decorator(func):
+                 patched_func = _create_patched_tool(
+                     func, metrics, metric_collection
+                 )
+                 return original_tool(*args, **kwargs)(patched_func)
+
+             return decorator
+
+     Agent.tool = wrapper
+
+
+ def _create_patched_tool(
+     func: Callable,
+     metrics: Optional[List[BaseMetric]] = None,
+     metric_collection: Optional[str] = None,
+ ):
+     import asyncio
+
+     original_func = func
+
+     is_async = asyncio.iscoroutinefunction(original_func)
+
+     if is_async:
+
+         @functools.wraps(original_func)
+         async def async_wrapper(*args, **kwargs):
+             with Observer(
+                 span_type="tool",
+                 func_name=original_func.__name__,
+                 metrics=metrics,
+                 metric_collection=metric_collection,
+                 function_kwargs={"args": args, **kwargs},
+             ) as observer:
+                 result = await original_func(*args, **kwargs)
+                 observer.result = result
+
+             return result
+
+         return async_wrapper
+     else:
+
+         @functools.wraps(original_func)
+         def sync_wrapper(*args, **kwargs):
+             with Observer(
+                 span_type="tool",
+                 func_name=original_func.__name__,
+                 metrics=metrics,
+                 metric_collection=metric_collection,
+                 function_kwargs={"args": args, **kwargs},
+             ) as observer:
+                 result = original_func(*args, **kwargs)
+                 observer.result = result
+
+             return result
+
+         return sync_wrapper
+
+
+ def _patch_agent_init():
+     original_init = Agent.__init__
+
+     @functools.wraps(original_init)
+     def wrapper(
+         self,
+         *args,
+         llm_metric_collection: Optional[str] = None,
+         llm_metrics: Optional[List[BaseMetric]] = None,
+         llm_prompt: Optional[Prompt] = None,
+         agent_metric_collection: Optional[str] = None,
+         agent_metrics: Optional[List[BaseMetric]] = None,
+         **kwargs
+     ):
+         result = original_init(self, *args, **kwargs)
+         _patch_llm_model(
+             self._model, llm_metric_collection, llm_metrics, llm_prompt
+         ) # runtime patch of the model
+         _patch_agent_run(agent_metric_collection, agent_metrics)
+         return result
+
+     Agent.__init__ = wrapper
+
+
+ def _patch_agent_run(
+     agent_metric_collection: Optional[str] = None,
+     agent_metrics: Optional[List[BaseMetric]] = None,
+ ):
+     original_run = Agent.run
+
+     @functools.wraps(original_run)
+     async def wrapper(
+         *args,
+         trace_metric_collection: Optional[str] = None,
+         trace_metrics: Optional[List[BaseMetric]] = None,
+         trace_name: Optional[str] = None,
+         trace_tags: Optional[List[str]] = None,
+         trace_metadata: Optional[dict] = None,
+         trace_thread_id: Optional[str] = None,
+         trace_user_id: Optional[str] = None,
+         **kwargs
+     ):
+         with Observer(
+             span_type="agent",
+             func_name="Agent",
+             function_kwargs={"input": args[1]},
+             metrics=agent_metrics,
+             metric_collection=agent_metric_collection,
+         ) as observer:
+             result = await original_run(*args, **kwargs)
+             observer.update_span_properties = (
+                 lambda agent_span: set_agent_span_attributes(agent_span, result)
+             )
+             observer.result = result.output
+
+             _update_trace_context(
+                 trace_name=trace_name,
+                 trace_tags=trace_tags,
+                 trace_metadata=trace_metadata,
+                 trace_thread_id=trace_thread_id,
+                 trace_user_id=trace_user_id,
+                 trace_metric_collection=trace_metric_collection,
+                 trace_metrics=trace_metrics,
+                 trace_input=args[1],
+                 trace_output=result.output,
+             )
+
+         return result
+
+     Agent.run = wrapper
+
+
+ def _update_trace_context(
+     trace_name: Optional[str] = None,
+     trace_tags: Optional[List[str]] = None,
+     trace_metadata: Optional[dict] = None,
+     trace_thread_id: Optional[str] = None,
+     trace_user_id: Optional[str] = None,
+     trace_metric_collection: Optional[str] = None,
+     trace_metrics: Optional[List[BaseMetric]] = None,
+     trace_input: Optional[Any] = None,
+     trace_output: Optional[Any] = None,
+ ):
+
+     current_trace = current_trace_context.get()
+     current_trace.name = trace_name
+     current_trace.tags = trace_tags
+     current_trace.metadata = trace_metadata
+     current_trace.thread_id = trace_thread_id
+     current_trace.user_id = trace_user_id
+     current_trace.metric_collection = trace_metric_collection
+     current_trace.metrics = trace_metrics
+     current_trace.input = trace_input
+     current_trace.output = trace_output
+
+
+ def _patch_llm_model(
+     model: Model,
+     llm_metric_collection: Optional[str] = None,
+     llm_metrics: Optional[List[BaseMetric]] = None,
+     llm_prompt: Optional[Prompt] = None,
+ ):
+     original_func = model.request
+     try:
+         model_name = model.model_name
+     except Exception:
+         model_name = "unknown"
+
+     @functools.wraps(original_func)
+     async def wrapper(*args, **kwargs):
+         with Observer(
+             span_type="llm",
+             func_name="LLM",
+             observe_kwargs={"model": model_name},
+             metrics=llm_metrics,
+             metric_collection=llm_metric_collection,
+         ) as observer:
+             result = await original_func(*args, **kwargs)
+             request = kwargs.get("messages", [])
+             if not request:
+                 request = args[0]
+             observer.update_span_properties = (
+                 lambda llm_span: set_llm_span_attributes(
+                     llm_span, request, result, llm_prompt
+                 )
+             )
+             observer.result = result
+         return result
+
+     model.request = wrapper
+
+
+ def instrument(otel: Optional[bool] = False, api_key: Optional[str] = None):
+
+     if api_key:
+         deepeval.login(api_key)
+
+     api_key = get_confident_api_key()
+
+     if not api_key:
+         raise ValueError("No api key provided.")
+
+     if otel:
+         instrument_pydantic_ai(api_key)
+     else:
+         with capture_tracing_integration("pydantic_ai"):
+             _patch_agent_init()
+             _patch_agent_tool_decorator()
+
+
+ def set_llm_span_attributes(
+     llm_span: LlmSpan,
+     requests: List[ModelRequest],
+     result: ModelResponse,
+     llm_prompt: Optional[Prompt] = None,
+ ):
+     llm_span.prompt = llm_prompt
+
+     input = []
+     for request in requests:
+         for part in request.parts:
+             if isinstance(part, SystemPromptPart):
+                 input.append({"role": "System", "content": part.content})
+             elif isinstance(part, UserPromptPart):
+                 input.append({"role": "User", "content": part.content})
+             elif isinstance(part, ToolCallPart):
+                 input.append(
+                     {
+                         "role": "Tool Call",
+                         "name": part.tool_name,
+                         "content": part.args_as_json_str(),
+                     }
+                 )
+             elif isinstance(part, ToolReturnPart):
+                 input.append(
+                     {
+                         "role": "Tool Return",
+                         "name": part.tool_name,
+                         "content": part.model_response_str(),
+                     }
+                 )
+     llm_span.input = input
+
+     content = ""
+     tool_calls = []
+     for part in result.parts:
+         if isinstance(part, TextPart):
+             content += part.content + "\n"
+         elif isinstance(part, ToolCallPart):
+             tool_calls.append(
+                 LlmToolCall(name=part.tool_name, args=part.args_as_dict())
+             )
+     llm_span.output = LlmOutput(
+         role="Assistant", content=content, tool_calls=tool_calls
+     )
+     llm_span.tools_called = _extract_tools_called_from_llm_response(
+         result.parts
+     )
+
+
+ def set_agent_span_attributes(agent_span: AgentSpan, result: AgentRunResult):
+     agent_span.tools_called = _extract_tools_called(result)
+
+
+ # llm tools called
+ def _extract_tools_called_from_llm_response(
+     result: List[ModelResponsePart],
+ ) -> List[ToolCall]:
+     tool_calls = []
+
+     # Loop through each ModelResponsePart
+     for part in result:
+         # Look for parts with part_kind="tool-call"
+         if hasattr(part, "part_kind") and part.part_kind == "tool-call":
+             # Extract tool name and args from the ToolCallPart
+             tool_name = part.tool_name
+             input_parameters = (
+                 part.args_as_dict() if hasattr(part, "args_as_dict") else None
+             )
+
+             # Create and append ToolCall object
+             tool_call = ToolCall(
+                 name=tool_name, input_parameters=input_parameters
+             )
+             tool_calls.append(tool_call)
+
+     return tool_calls
+
+
+ # TODO: llm tools called (response is present in the next message)
+ def _extract_tools_called(result: AgentRunResult) -> List[ToolCall]:
+     tool_calls = []
+
+     # Access the message history from the _state
+     message_history = result._state.message_history
+
+     # Scan through all messages in the history
+     for message in message_history:
+         # Check if this is a ModelResponse (kind="response")
+         if hasattr(message, "kind") and message.kind == "response":
+             # For ModelResponse messages, check each part
+             if hasattr(message, "parts"):
+                 for part in message.parts:
+                     # Look for parts with part_kind="tool-call"
+                     if (
+                         hasattr(part, "part_kind")
+                         and part.part_kind == "tool-call"
+                     ):
+                         # Extract tool name and args from the ToolCallPart
+                         tool_name = part.tool_name
+                         input_parameters = (
+                             part.args_as_dict()
+                             if hasattr(part, "args_as_dict")
+                             else None
+                         )
+
+                         # Create and append ToolCall object
+                         tool_call = ToolCall(
+                             name=tool_name, input_parameters=input_parameters
+                         )
+                         tool_calls.append(tool_call)
+
+     return tool_calls
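Taken together, these patches appear to replace the deleted deepeval/integrations/pydantic_ai/agent.py wrapper: calling instrument() monkey-patches Agent.__init__, Agent.tool, Agent.run and the underlying Model.request so spans are recorded through Observer, while otel=True routes through instrument_pydantic_ai from the renamed otel module instead. A minimal usage sketch, assuming a Confident AI key is already configured via deepeval login; the model id, metric and tool below are illustrative, not taken from this diff:

```python
import asyncio

from deepeval.integrations.pydantic_ai.patcher import instrument
from deepeval.metrics import AnswerRelevancyMetric  # illustrative metric choice
from pydantic_ai import Agent, RunContext

instrument()  # applies the Agent/Model patches shown above

agent = Agent(
    "openai:gpt-4o-mini",                   # illustrative model id
    llm_metrics=[AnswerRelevancyMetric()],  # consumed by the patched __init__
)

@agent.tool(metric_collection="tools")      # "Case 2" path of the patched decorator
async def get_weather(ctx: RunContext[None], city: str) -> str:
    return f"It is sunny in {city}."

result = asyncio.run(
    agent.run("What is the weather in Paris?", trace_name="weather-demo")
)
print(result.output)
```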
deepeval/key_handler.py CHANGED
@@ -80,6 +80,7 @@ class ModelKeyValues(Enum):
  OPENAI_MODEL_NAME = "OPENAI_MODEL_NAME"
  OPENAI_COST_PER_INPUT_TOKEN = "OPENAI_COST_PER_INPUT_TOKEN"
  OPENAI_COST_PER_OUTPUT_TOKEN = "OPENAI_COST_PER_OUTPUT_TOKEN"
+ OPENAI_API_KEY = "OPENAI_API_KEY"
  # Moonshot
  USE_MOONSHOT_MODEL = "USE_MOONSHOT_MODEL"
  MOONSHOT_MODEL_NAME = "MOONSHOT_MODEL_NAME"
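The new enum member lets the OpenAI key be persisted through the same key store as the other ModelKeyValues entries. A minimal sketch, assuming the existing KEY_FILE_HANDLER helpers (write_key / fetch_data) accept the new member like any other:

```python
from deepeval.key_handler import KEY_FILE_HANDLER, ModelKeyValues

# Assumed helpers: write_key persists to deepeval's local key store,
# fetch_data reads it back (falling back to the env var is up to the caller).
KEY_FILE_HANDLER.write_key(ModelKeyValues.OPENAI_API_KEY, "sk-placeholder")
api_key = KEY_FILE_HANDLER.fetch_data(ModelKeyValues.OPENAI_API_KEY)
print(api_key is not None)
```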
deepeval/metrics/answer_relevancy/template.py CHANGED
@@ -37,7 +37,7 @@ JSON:
  Please generate a list of JSON with two keys: `verdict` and `reason`.
  The 'verdict' key should STRICTLY be either a 'yes', 'idk' or 'no'. Answer 'yes' if the statement is relevant to addressing the original input, 'no' if the statement is irrelevant, and 'idk' if it is ambiguous (eg., not directly relevant but could be used as a supporting point to address the input).
  The 'reason' is the reason for the verdict.
- Provide a 'reason' ONLY if the answer is 'no'.
+ Provide a 'reason' ONLY if the answer is 'no' or 'idk'.
  The provided statements are statements made in the actual output.
 
  **
@@ -53,7 +53,8 @@ Example statements:
  "Security features include fingerprint authentication and an encrypted SSD.",
  "Every purchase comes with a one-year warranty.",
  "24/7 customer support is included.",
- "Pineapples taste great on pizza."
+ "Pineapples taste great on pizza.",
+ "The laptop is a Dell XPS 13."
  ]
 
  Example JSON:
@@ -79,6 +80,10 @@ Example JSON:
  {{
  "verdict": "no",
  "reason": "The statement about pineapples on pizza is completely irrelevant to the input, which asks about laptop features."
+ }},
+ {{
+ "verdict": "idk",
+ "reason": "The statement about the laptop being a Dell XPS 13 is not directly relevant to the input, but could be used as a supporting point to address the input."
  }}
  ]
  }}
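These prompt changes only affect how the judge explains 'idk' verdicts; AnswerRelevancyMetric is invoked the same way as before. A minimal sketch with an illustrative test case (requires a configured judge model, e.g. an OpenAI key):

```python
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase

test_case = LLMTestCase(
    input="What features does the laptop have?",
    actual_output=(
        "The laptop is a Dell XPS 13. Security features include fingerprint "
        "authentication. Pineapples taste great on pizza."
    ),
)

metric = AnswerRelevancyMetric(threshold=0.7, include_reason=True)
metric.measure(test_case)
print(metric.score, metric.reason)  # 'idk' verdicts now come back with reasons
```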
deepeval/metrics/faithfulness/template.py CHANGED
@@ -4,7 +4,7 @@ from typing import Optional, List
  class FaithfulnessTemplate:
  @staticmethod
  def generate_claims(actual_output: str):
- return f"""Based on the given text, please extract a comprehensive list of FACTUAL, undisputed truths, that can inferred from the provided text.
+ return f"""Based on the given text, please extract a comprehensive list of FACTUAL, undisputed truths, that can inferred from the provided actual AI output.
  These truths, MUST BE COHERENT, and CANNOT be taken out of context.
 
  Example:
@@ -24,9 +24,10 @@ Example JSON:
  IMPORTANT: Please make sure to only return in JSON format, with the "claims" key as a list of strings. No words or explanation is needed.
  Only include claims that are factual, BUT IT DOESN'T MATTER IF THEY ARE FACTUALLY CORRECT. The claims you extract should include the full context it was presented in, NOT cherry picked facts.
  You should NOT include any prior knowledge, and take the text at face value when extracting claims.
+ You should be aware that it is an AI that is outputting these claims.
  **
 
- Text:
+ AI Output:
  {actual_output}
 
  JSON:
@@ -72,7 +73,7 @@ JSON:
  def generate_verdicts(claims: List[str], retrieval_context: str):
  return f"""Based on the given claims, which is a list of strings, generate a list of JSON objects to indicate whether EACH claim contradicts any facts in the retrieval context. The JSON will have 2 fields: 'verdict' and 'reason'.
  The 'verdict' key should STRICTLY be either 'yes', 'no', or 'idk', which states whether the given claim agrees with the context.
- Provide a 'reason' ONLY if the answer is 'no'.
+ Provide a 'reason' ONLY if the answer is 'no' or 'idk'.
  The provided claim is drawn from the actual output. Try to provide a correction in the reason using the facts in the retrieval context.
 
  **
@@ -84,28 +85,30 @@ Example:
  {{
  "verdicts": [
  {{
- "verdict": "idk"
+ "verdict": "idk",
+ "reason": "The claim about Barack Obama is although incorrect, it is not directly addressed in the retrieval context, and so poses no contradiction."
  }},
  {{
- "verdict": "idk"
+ "verdict": "idk",
+ "reason": "The claim about Zurich being a city in London is incorrect but does not pose a contradiction to the retrieval context."
  }},
  {{
  "verdict": "yes"
  }},
  {{
  "verdict": "no",
- "reason": "The actual output claims Einstein won the Nobel Prize in 1969, which is untrue as the retrieval context states it is 1968 instead."
+ "reason": "The actual output claims Einstein won the Nobel Prize in 1969, which is untrue as the retrieval context states it is 1968 instead. This contradicts the retrieval context."
  }},
  {{
  "verdict": "no",
- "reason": "The actual output claims Einstein is a German chef, which is not correct as the retrieval context states he was a German scientist instead."
+ "reason": "The actual output claims Einstein is a German chef, which is not correct as the retrieval context states he was a German scientist instead. This contradicts the retrieval context."
  }},
  ]
  }}
  ===== END OF EXAMPLE ======
 
  The length of 'verdicts' SHOULD BE STRICTLY EQUAL to that of claims.
- You DON'T have to provide a reason if the answer is 'yes' or 'idk'.
+ You DON'T have to provide a reason if the answer is 'yes'.
  ONLY provide a 'no' answer if the retrieval context DIRECTLY CONTRADICTS the claims. YOU SHOULD NEVER USE YOUR PRIOR KNOWLEDGE IN YOUR JUDGEMENT.
  Claims made using vague, suggestive, speculative language such as 'may have', 'possibility due to', does NOT count as a contradiction.
  Claims that are not backed up by the retrieval context or are not mentioned in it MUST be answered 'idk'.
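As with answer relevancy, the judge is now asked to justify 'idk' verdicts, and the claim-extraction prompt frames the text as AI output; FaithfulnessMetric itself is called as before. A minimal sketch mirroring the Einstein example from the template:

```python
from deepeval.metrics import FaithfulnessMetric
from deepeval.test_case import LLMTestCase

test_case = LLMTestCase(
    input="When did Einstein win the Nobel Prize?",
    actual_output="Einstein won the Nobel Prize in 1969.",
    retrieval_context=["Einstein won the Nobel Prize in 1968."],
)

metric = FaithfulnessMetric(threshold=0.8, include_reason=True)
metric.measure(test_case)
print(metric.score, metric.reason)
```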
deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py CHANGED
@@ -39,7 +39,7 @@ class MultimodalAnswerRelevancyTemplate:
  Please generate a list of JSON with two keys: `verdict` and `reason`.
  The 'verdict' key should STRICTLY be either a 'yes', 'idk' or 'no'. Answer 'yes' if the statement or image is relevant to addressing the original input, 'no' if the statement or image is irrelevant, and 'idk' if it is ambiguous (eg., not directly relevant but could be used as a supporting point to address the input).
  The 'reason' is the reason for the verdict.
- Provide a 'reason' ONLY if the answer is 'no'.
+ Provide a 'reason' ONLY if the answer is 'no' or 'idk'.
  The provided statements are statements and images generated in the actual output.
 
  **
@@ -54,13 +54,15 @@ class MultimodalAnswerRelevancyTemplate:
  "reason": "The 'Shoes.' statement made in the actual output is completely irrelevant to the input, which asks about what to do in the event of an earthquake."
  }},
  {{
- "verdict": "idk"
+ "verdict": "idk",
+ "reason": "The statement thanking the user for asking the question is not directly relevant to the input, but is not entirely irrelevant."
  }},
  {{
- "verdict": "idk"
+ "verdict": "idk",
+ "reason": "The question about whether there is anything else the user can help with is not directly relevant to the input, but is not entirely irrelevant."
  }},
  {{
- "verdict": "yes"
+ "verdict": "yes",
  }}
  ]
  }}
deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py CHANGED
@@ -95,7 +95,7 @@ class MultimodalFaithfulnessTemplate:
  return textwrap.dedent(
  f"""Based on the given claims, which is a list of strings, generate a list of JSON objects to indicate whether EACH claim contradicts any facts in the retrieval context. The JSON will have 2 fields: 'verdict' and 'reason'.
  The 'verdict' key should STRICTLY be either 'yes', 'no', or 'idk', which states whether the given claim agrees with the context.
- Provide a 'reason' ONLY if the answer is 'no'.
+ Provide a 'reason' ONLY if the answer is 'no' or 'idk'.
  The provided claim is drawn from the actual output. Try to provide a correction in the reason using the facts in the retrieval context.
 
  **
@@ -107,10 +107,12 @@ class MultimodalFaithfulnessTemplate:
  {{
  "verdicts": [
  {{
- "verdict": "idk"
+ "verdict": "idk",
+ "reason": "The claim about Barack Obama is not directly addressed in the retrieval context, and so poses no contradiction."
  }},
  {{
- "verdict": "idk"
+ "verdict": "idk",
+ "reason": "The claim about Zurich being a city in London is incorrect but does not pose a contradiction to the retrieval context."
  }},
  {{
  "verdict": "yes"
@@ -128,7 +130,7 @@ class MultimodalFaithfulnessTemplate:
  ===== END OF EXAMPLE ======
 
  The length of 'verdicts' SHOULD BE STRICTLY EQUAL to that of claims.
- You DON'T have to provide a reason if the answer is 'yes' or 'idk'.
+ You DON'T have to provide a reason if the answer is 'yes'.
  ONLY provide a 'no' answer if the retrieval context DIRECTLY CONTRADICTS the claims. YOU SHOULD NEVER USE YOUR PRIOR KNOWLEDGE IN YOUR JUDGEMENT.
  Claims made using vague, suggestive, speculative language such as 'may have', 'possibility due to', does NOT count as a contradiction.
  Claims that is not backed up due to a lack of information/is not mentioned in the retrieval contexts MUST be answered 'idk', otherwise I WILL DIE.
deepeval/metrics/tool_correctness/tool_correctness.py CHANGED
@@ -223,9 +223,13 @@ class ToolCorrectnessMetric(BaseMetric):
  total_score += best_score
  matched_called_tools.add(best_called_tool)
  return (
- total_score / len(self.expected_tools)
- if self.expected_tools
- else 0.0
+ 1.0
+ if not self.expected_tools and not self.tools_called
+ else (
+ 0.0
+ if not self.expected_tools
+ else total_score / len(self.expected_tools)
+ )
  )
 
  # Consider ordering score
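The rewritten return expression changes one edge case: when no tools were expected and none were called, the score is now 1.0 instead of 0.0 (expected-but-uncalled still scores 0.0). A minimal sketch of that case:

```python
from deepeval.metrics import ToolCorrectnessMetric
from deepeval.test_case import LLMTestCase

# No tools expected, no tools called: previously 0.0, now treated as a perfect match.
test_case = LLMTestCase(
    input="Just say hello.",
    actual_output="Hello!",
    tools_called=[],
    expected_tools=[],
)

metric = ToolCorrectnessMetric()
metric.measure(test_case)
print(metric.score)  # 1.0 under the new branch
```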
deepeval/models/llms/amazon_bedrock_model.py CHANGED
@@ -115,13 +115,34 @@ class AmazonBedrockModel(DeepEvalBaseLLM):
  ###############################################
 
  def get_converse_request_body(self, prompt: str) -> dict:
+ # Inline parameter translation with defaults
+ param_mapping = {
+ "max_tokens": "maxTokens",
+ "top_p": "topP",
+ "top_k": "topK",
+ "stop_sequences": "stopSequences",
+ }
+
+ # Start with defaults for required parameters
+ translated_kwargs = {
+ "maxTokens": self.generation_kwargs.get("max_tokens", 1000),
+ "topP": self.generation_kwargs.get("top_p", 0),
+ }
+
+ # Add any other parameters from generation_kwargs
+ for key, value in self.generation_kwargs.items():
+ if key not in [
+ "max_tokens",
+ "top_p",
+ ]: # Skip already handled defaults
+ aws_key = param_mapping.get(key, key)
+ translated_kwargs[aws_key] = value
+
  return {
  "messages": [{"role": "user", "content": [{"text": prompt}]}],
  "inferenceConfig": {
  "temperature": self.temperature,
- "topP": self.generation_kwargs.get("top_p", 0),
- "maxTokens": self.generation_kwargs.get("max_tokens", 1000),
- **self.generation_kwargs,
+ **translated_kwargs,
  },
  }
 
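The net effect of the new block is that snake_case generation_kwargs are translated to the camelCase names the Bedrock Converse API expects, instead of being splatted in verbatim alongside the already-set topP/maxTokens defaults. A standalone sketch of the mapping, using a hypothetical generation_kwargs dict:

```python
# Mirrors the translation inside get_converse_request_body (illustrative input).
generation_kwargs = {"max_tokens": 512, "top_p": 0.9, "stop_sequences": ["END"]}

param_mapping = {
    "max_tokens": "maxTokens",
    "top_p": "topP",
    "top_k": "topK",
    "stop_sequences": "stopSequences",
}

translated_kwargs = {
    "maxTokens": generation_kwargs.get("max_tokens", 1000),
    "topP": generation_kwargs.get("top_p", 0),
}
for key, value in generation_kwargs.items():
    if key not in ("max_tokens", "top_p"):
        translated_kwargs[param_mapping.get(key, key)] = value

print(translated_kwargs)
# {'maxTokens': 512, 'topP': 0.9, 'stopSequences': ['END']}
```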
deepeval/models/llms/grok_model.py CHANGED
@@ -30,7 +30,7 @@ model_pricing = {
  },
  "grok-3-fast": {
  "input": 0.60 / 1e6,
- "output": 2.50 / 1e-6,
+ "output": 2.50 / 1e6,
  },
  "grok-3-mini-fast": {
  "input": 30 / 1e6,
deepeval/models/llms/kimi_model.py CHANGED
@@ -30,7 +30,7 @@ model_pricing = {
  },
  "kimi-k2-0711-preview": {
  "input": 0.60 / 1e6,
- "output": 2.50 / 1e-6,
+ "output": 2.50 / 1e6,
  },
  "kimi-thinking-preview": {
  "input": 30 / 1e6,