deepeval 3.7.6__py3-none-any.whl → 3.7.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. deepeval/_version.py +1 -1
  2. deepeval/cli/main.py +2022 -759
  3. deepeval/cli/utils.py +208 -36
  4. deepeval/config/dotenv_handler.py +19 -0
  5. deepeval/config/settings.py +658 -262
  6. deepeval/config/utils.py +9 -1
  7. deepeval/dataset/test_run_tracer.py +4 -6
  8. deepeval/evaluate/execute.py +153 -94
  9. deepeval/integrations/pydantic_ai/instrumentator.py +4 -2
  10. deepeval/integrations/pydantic_ai/otel.py +5 -1
  11. deepeval/key_handler.py +121 -51
  12. deepeval/metrics/base_metric.py +9 -3
  13. deepeval/metrics/g_eval/g_eval.py +6 -1
  14. deepeval/metrics/indicator.py +8 -4
  15. deepeval/metrics/mcp/mcp_task_completion.py +15 -16
  16. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +15 -15
  17. deepeval/metrics/mcp/schema.py +4 -0
  18. deepeval/metrics/mcp/template.py +8 -1
  19. deepeval/metrics/prompt_alignment/prompt_alignment.py +6 -3
  20. deepeval/metrics/tool_use/schema.py +4 -0
  21. deepeval/metrics/tool_use/template.py +16 -2
  22. deepeval/metrics/tool_use/tool_use.py +30 -28
  23. deepeval/metrics/topic_adherence/schema.py +4 -0
  24. deepeval/metrics/topic_adherence/template.py +8 -1
  25. deepeval/metrics/topic_adherence/topic_adherence.py +15 -14
  26. deepeval/metrics/turn_contextual_precision/template.py +8 -1
  27. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +44 -86
  28. deepeval/metrics/turn_contextual_recall/template.py +8 -1
  29. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +44 -82
  30. deepeval/metrics/turn_contextual_relevancy/template.py +8 -1
  31. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +48 -92
  32. deepeval/metrics/turn_faithfulness/template.py +8 -1
  33. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +76 -130
  34. deepeval/metrics/utils.py +16 -1
  35. deepeval/models/__init__.py +2 -0
  36. deepeval/models/llms/__init__.py +2 -0
  37. deepeval/models/llms/amazon_bedrock_model.py +5 -4
  38. deepeval/models/llms/anthropic_model.py +4 -3
  39. deepeval/models/llms/azure_model.py +4 -3
  40. deepeval/models/llms/deepseek_model.py +5 -8
  41. deepeval/models/llms/grok_model.py +5 -8
  42. deepeval/models/llms/kimi_model.py +5 -8
  43. deepeval/models/llms/litellm_model.py +2 -0
  44. deepeval/models/llms/local_model.py +1 -1
  45. deepeval/models/llms/openai_model.py +4 -3
  46. deepeval/models/retry_policy.py +10 -5
  47. deepeval/models/utils.py +1 -5
  48. deepeval/simulator/conversation_simulator.py +6 -2
  49. deepeval/simulator/template.py +3 -1
  50. deepeval/synthesizer/synthesizer.py +19 -17
  51. deepeval/test_run/test_run.py +6 -1
  52. deepeval/utils.py +26 -0
  53. {deepeval-3.7.6.dist-info → deepeval-3.7.8.dist-info}/METADATA +3 -3
  54. {deepeval-3.7.6.dist-info → deepeval-3.7.8.dist-info}/RECORD +57 -56
  55. {deepeval-3.7.6.dist-info → deepeval-3.7.8.dist-info}/LICENSE.md +0 -0
  56. {deepeval-3.7.6.dist-info → deepeval-3.7.8.dist-info}/WHEEL +0 -0
  57. {deepeval-3.7.6.dist-info → deepeval-3.7.8.dist-info}/entry_points.txt +0 -0
deepeval/key_handler.py CHANGED
@@ -5,7 +5,9 @@ import json
 import logging
 
 from enum import Enum
-from typing import Union
+from functools import lru_cache
+from pydantic import SecretStr
+from typing import get_args, get_origin, Union
 
 from .constants import KEY_FILE, HIDDEN_DIR
 
@@ -13,26 +15,34 @@ from .constants import KEY_FILE, HIDDEN_DIR
 logger = logging.getLogger(__name__)
 
 
-SECRET_KEYS = {
-    # General providers
-    "OPENAI_API_KEY",
-    "ANTHROPIC_API_KEY",
-    # Azure OpenAI
-    "AZURE_OPENAI_API_KEY",
-    # Google / Gemini
-    "GOOGLE_API_KEY",
-    # xAI Grok
-    "GROK_API_KEY",
-    # Moonshot
-    "MOONSHOT_API_KEY",
-    # DeepSeek
-    "DEEPSEEK_API_KEY",
-    # LiteLLM
-    "LITELLM_API_KEY",
-    # Local gateways (if any require keys)
-    "LOCAL_MODEL_API_KEY",
-    "LOCAL_EMBEDDING_API_KEY",
-}
+@lru_cache(maxsize=1)
+def _secret_env_keys() -> frozenset[str]:
+    # Lazy import avoids cycles at import time
+    from deepeval.config.settings import Settings
+
+    secret_keys: set[str] = set()
+    for env_key, field in Settings.model_fields.items():
+        ann = field.annotation
+        if ann is SecretStr:
+            secret_keys.add(env_key)
+            continue
+
+        origin = get_origin(ann)
+        if origin is Union and any(a is SecretStr for a in get_args(ann)):
+            secret_keys.add(env_key)
+
+    return frozenset(secret_keys)
+
+
+def _env_key_for_legacy_enum(key) -> str:
+    # For ModelKeyValues, .name == .value, for KeyValues it's the important one:
+    # KeyValues.API_KEY.name == "API_KEY" (matches Settings), value == "api_key" (legacy json key)
+    return getattr(key, "name", str(key))
+
+
+def _is_secret_key(key) -> bool:
+    return _env_key_for_legacy_enum(key) in _secret_env_keys()
+
 
 _WARNED_SECRET_KEYS = set()
 
@@ -40,7 +50,10 @@ _WARNED_SECRET_KEYS = set()
 class KeyValues(Enum):
     # Confident AI
     API_KEY = "api_key"
+    CONFIDENT_API_KEY = "confident_api_key"
+    CONFIDENT_BASE_URL = "confident_base_url"
     CONFIDENT_REGION = "confident_region"
+
     # Cache
     LAST_TEST_RUN_LINK = "last_test_run_link"
     LAST_TEST_RUN_DATA = "last_test_run_data"
@@ -49,6 +62,24 @@ class KeyValues(Enum):
 class ModelKeyValues(Enum):
     # General
     TEMPERATURE = "TEMPERATURE"
+
+    # Anthropic
+    USE_ANTHROPIC_MODEL = "USE_ANTHROPIC_MODEL"
+    ANTHROPIC_API_KEY = "ANTHROPIC_API_KEY"
+    ANTHROPIC_MODEL_NAME = "ANTHROPIC_MODEL_NAME"
+    ANTHROPIC_COST_PER_INPUT_TOKEN = "ANTHROPIC_COST_PER_INPUT_TOKEN"
+    ANTHROPIC_COST_PER_OUTPUT_TOKEN = "ANTHROPIC_COST_PER_OUTPUT_TOKEN"
+
+    # AWS
+    AWS_ACCESS_KEY_ID = "AWS_ACCESS_KEY_ID"
+    AWS_SECRET_ACCESS_KEY = "AWS_SECRET_ACCESS_KEY"
+    # AWS Bedrock
+    USE_AWS_BEDROCK_MODEL = "USE_AWS_BEDROCK_MODEL"
+    AWS_BEDROCK_MODEL_NAME = "AWS_BEDROCK_MODEL_NAME"
+    AWS_BEDROCK_REGION = "AWS_BEDROCK_REGION"
+    AWS_BEDROCK_COST_PER_INPUT_TOKEN = "AWS_BEDROCK_COST_PER_INPUT_TOKEN"
+    AWS_BEDROCK_COST_PER_OUTPUT_TOKEN = "AWS_BEDROCK_COST_PER_OUTPUT_TOKEN"
+
     # Azure Open AI
     AZURE_OPENAI_API_KEY = "AZURE_OPENAI_API_KEY"
     AZURE_OPENAI_ENDPOINT = "AZURE_OPENAI_ENDPOINT"
@@ -57,43 +88,79 @@ class ModelKeyValues(Enum):
     AZURE_MODEL_NAME = "AZURE_MODEL_NAME"
     AZURE_MODEL_VERSION = "AZURE_MODEL_VERSION"
     USE_AZURE_OPENAI = "USE_AZURE_OPENAI"
-    # Local Model
-    LOCAL_MODEL_NAME = "LOCAL_MODEL_NAME"
-    LOCAL_MODEL_BASE_URL = "LOCAL_MODEL_BASE_URL"
-    LOCAL_MODEL_API_KEY = "LOCAL_MODEL_API_KEY"
-    LOCAL_MODEL_FORMAT = "LOCAL_MODEL_FORMAT"
-    USE_LOCAL_MODEL = "USE_LOCAL_MODEL"
+
+    # DeepSeek
+    USE_DEEPSEEK_MODEL = "USE_DEEPSEEK_MODEL"
+    DEEPSEEK_API_KEY = "DEEPSEEK_API_KEY"
+    DEEPSEEK_MODEL_NAME = "DEEPSEEK_MODEL_NAME"
+    DEEPSEEK_COST_PER_INPUT_TOKEN = "DEEPSEEK_COST_PER_INPUT_TOKEN"
+    DEEPSEEK_COST_PER_OUTPUT_TOKEN = "DEEPSEEK_COST_PER_OUTPUT_TOKEN"
+
     # Gemini
     USE_GEMINI_MODEL = "USE_GEMINI_MODEL"
-    GEMINI_MODEL_NAME = "GEMINI_MODEL_NAME"
     GOOGLE_API_KEY = "GOOGLE_API_KEY"
+    GEMINI_MODEL_NAME = "GEMINI_MODEL_NAME"
     GOOGLE_GENAI_USE_VERTEXAI = "GOOGLE_GENAI_USE_VERTEXAI"
     GOOGLE_CLOUD_PROJECT = "GOOGLE_CLOUD_PROJECT"
     GOOGLE_CLOUD_LOCATION = "GOOGLE_CLOUD_LOCATION"
     GOOGLE_SERVICE_ACCOUNT_KEY = "GOOGLE_SERVICE_ACCOUNT_KEY"
+
+    # Grok
+    USE_GROK_MODEL = "USE_GROK_MODEL"
+    GROK_API_KEY = "GROK_API_KEY"
+    GROK_MODEL_NAME = "GROK_MODEL_NAME"
+    GROK_COST_PER_INPUT_TOKEN = "GROK_COST_PER_INPUT_TOKEN"
+    GROK_COST_PER_OUTPUT_TOKEN = "GROK_COST_PER_OUTPUT_TOKEN"
+
     # LiteLLM
     USE_LITELLM = "USE_LITELLM"
-    LITELLM_MODEL_NAME = "LITELLM_MODEL_NAME"
     LITELLM_API_KEY = "LITELLM_API_KEY"
+    LITELLM_MODEL_NAME = "LITELLM_MODEL_NAME"
     LITELLM_API_BASE = "LITELLM_API_BASE"
+    LITELLM_PROXY_API_BASE = "LITELLM_PROXY_API_BASE"
+    LITELLM_PROXY_API_KEY = "LITELLM_PROXY_API_KEY"
+
+    # LM Studio
+    LM_STUDIO_API_KEY = "LM_STUDIO_API_KEY"
+    LM_STUDIO_MODEL_NAME = "LM_STUDIO_MODEL_NAME"
+
+    # Local Model
+    USE_LOCAL_MODEL = "USE_LOCAL_MODEL"
+    LOCAL_MODEL_API_KEY = "LOCAL_MODEL_API_KEY"
+    LOCAL_MODEL_NAME = "LOCAL_MODEL_NAME"
+    LOCAL_MODEL_BASE_URL = "LOCAL_MODEL_BASE_URL"
+    LOCAL_MODEL_FORMAT = "LOCAL_MODEL_FORMAT"
+
+    # Moonshot
+    USE_MOONSHOT_MODEL = "USE_MOONSHOT_MODEL"
+    MOONSHOT_API_KEY = "MOONSHOT_API_KEY"
+    MOONSHOT_MODEL_NAME = "MOONSHOT_MODEL_NAME"
+    MOONSHOT_COST_PER_INPUT_TOKEN = "MOONSHOT_COST_PER_INPUT_TOKEN"
+    MOONSHOT_COST_PER_OUTPUT_TOKEN = "MOONSHOT_COST_PER_OUTPUT_TOKEN"
+
+    # Ollama
+    OLLAMA_MODEL_NAME = "OLLAMA_MODEL_NAME"
+
     # OpenAI
     USE_OPENAI_MODEL = "USE_OPENAI_MODEL"
+    OPENAI_API_KEY = "OPENAI_API_KEY"
     OPENAI_MODEL_NAME = "OPENAI_MODEL_NAME"
     OPENAI_COST_PER_INPUT_TOKEN = "OPENAI_COST_PER_INPUT_TOKEN"
     OPENAI_COST_PER_OUTPUT_TOKEN = "OPENAI_COST_PER_OUTPUT_TOKEN"
-    OPENAI_API_KEY = "OPENAI_API_KEY"
-    # Moonshot
-    USE_MOONSHOT_MODEL = "USE_MOONSHOT_MODEL"
-    MOONSHOT_MODEL_NAME = "MOONSHOT_MODEL_NAME"
-    MOONSHOT_API_KEY = "MOONSHOT_API_KEY"
-    # Grok
-    USE_GROK_MODEL = "USE_GROK_MODEL"
-    GROK_MODEL_NAME = "GROK_MODEL_NAME"
-    GROK_API_KEY = "GROK_API_KEY"
-    # DeepSeek
-    USE_DEEPSEEK_MODEL = "USE_DEEPSEEK_MODEL"
-    DEEPSEEK_MODEL_NAME = "DEEPSEEK_MODEL_NAME"
-    DEEPSEEK_API_KEY = "DEEPSEEK_API_KEY"
+
+    # PortKey
+    USE_PORTKEY_MODEL = "USE_PORTKEY_MODEL"
+    PORTKEY_API_KEY = "PORTKEY_API_KEY"
+    PORTKEY_MODEL_NAME = "PORTKEY_MODEL_NAME"
+    PORTKEY_BASE_URL = "PORTKEY_BASE_URL"
+    PORTKEY_PROVIDER_NAME = "PORTKEY_PROVIDER_NAME"
+
+    # Vertex AI
+    VERTEX_AI_MODEL_NAME = "VERTEX_AI_MODEL_NAME"
+
+    # VLLM
+    VLLM_API_KEY = "VLLM_API_KEY"
+    VLLM_MODEL_NAME = "VLLM_MODEL_NAME"
 
 
 class EmbeddingKeyValues(Enum):
@@ -123,9 +190,11 @@ class KeyFileHandler:
         """Appends or updates data in the hidden file"""
 
         # hard stop on secrets: never write to disk
-        if key.value in SECRET_KEYS:
+        if _is_secret_key(key):
             logger.warning(
-                f"{key} is blacklisted, refusing to persist. Keep your secrets in .env or .env.local instead"
+                "%s is a secret setting, refusing to persist. "
+                "Keep your secrets in .env or .env.local instead.",
+                _env_key_for_legacy_enum(key),
             )
             return
 
@@ -170,16 +239,17 @@ class KeyFileHandler:
         # Deprecation: warn only if we're actually returning a secret
         if (
             value is not None
-            and key.value in SECRET_KEYS
-            and key.value not in _WARNED_SECRET_KEYS
+            and _is_secret_key(key)
+            and _env_key_for_legacy_enum(key) not in _WARNED_SECRET_KEYS
        ):
             logger.warning(
-                f"Reading secret '{key.value}' from legacy {HIDDEN_DIR}/{KEY_FILE}. "
-                "Persisting API keys in plaintext is deprecated. "
-                "Move this to your environment (.env / .env.local). "
-                "This fallback will be removed in a future release."
+                "Reading secret '%s' from legacy %s/%s. Persisting API keys in plaintext is deprecated. "
+                "Move this to your environment (.env / .env.local). This fallback will be removed in a future release.",
+                _env_key_for_legacy_enum(key),
+                HIDDEN_DIR,
+                KEY_FILE,
             )
-            _WARNED_SECRET_KEYS.add(key.value)
+            _WARNED_SECRET_KEYS.add(_env_key_for_legacy_enum(key))
 
         return value
 
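The hard-coded SECRET_KEYS set is gone; key_handler.py now asks the Settings model which fields are annotated as SecretStr, either directly or inside a Union/Optional. A minimal sketch of that detection idea, shown against a hypothetical FakeSettings model rather than deepeval's real Settings:

```python
# Sketch only: FakeSettings is a stand-in, not deepeval's actual Settings model.
from typing import Optional, Union, get_args, get_origin

from pydantic import BaseModel, SecretStr


class FakeSettings(BaseModel):
    OPENAI_API_KEY: Optional[SecretStr] = None  # Optional[...] is Union[SecretStr, None]
    OPENAI_MODEL_NAME: Optional[str] = None
    TEMPERATURE: float = 0.0


def secret_field_names(model_cls) -> frozenset:
    secrets = set()
    for name, field in model_cls.model_fields.items():
        ann = field.annotation
        if ann is SecretStr:  # plain SecretStr annotation
            secrets.add(name)
            continue
        # Union/Optional annotations that include SecretStr
        if get_origin(ann) is Union and any(a is SecretStr for a in get_args(ann)):
            secrets.add(name)
    return frozenset(secrets)


print(secret_field_names(FakeSettings))  # frozenset({'OPENAI_API_KEY'})
```

Anything detected this way is refused by the write path and only read back, with a deprecation warning, from the legacy hidden JSON file.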
deepeval/metrics/base_metric.py CHANGED
@@ -49,8 +49,10 @@ class BaseMetric:
         return "Base Metric"
 
     def _accrue_cost(self, cost: float) -> None:
-        if self.evaluation_cost is not None:
+        if self.evaluation_cost is not None and cost is not None:
             self.evaluation_cost += cost
+        else:
+            self.evaluation_cost = None
 
 
 class BaseConversationalMetric:
@@ -94,8 +96,10 @@ class BaseConversationalMetric:
         return "Base Conversational Metric"
 
     def _accrue_cost(self, cost: float) -> None:
-        if self.evaluation_cost is not None:
+        if self.evaluation_cost is not None and cost is not None:
             self.evaluation_cost += cost
+        else:
+            self.evaluation_cost = None
 
 
 class BaseArenaMetric:
@@ -129,5 +133,7 @@ class BaseArenaMetric:
         return "Base Arena Metric"
 
     def _accrue_cost(self, cost: float) -> None:
-        if self.evaluation_cost is not None:
+        if self.evaluation_cost is not None and cost is not None:
             self.evaluation_cost += cost
+        else:
+            self.evaluation_cost = None
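The widened guard in _accrue_cost changes how unknown costs propagate: once any call reports cost=None, the running total becomes None instead of silently under-counting. A toy stand-in (not the real BaseMetric) showing the effect:

```python
# Toy stand-in; the real metric classes initialize evaluation_cost elsewhere.
class CostTracker:
    def __init__(self):
        self.evaluation_cost = 0.0

    def _accrue_cost(self, cost):
        # Same rule as the diff above: any None poisons the total.
        if self.evaluation_cost is not None and cost is not None:
            self.evaluation_cost += cost
        else:
            self.evaluation_cost = None


t = CostTracker()
t._accrue_cost(0.002)
t._accrue_cost(None)   # a call that did not report a cost
t._accrue_cost(0.001)  # stays None from here on
print(t.evaluation_cost)  # None, rather than a misleading 0.003
```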
deepeval/metrics/g_eval/g_eval.py CHANGED
@@ -110,10 +110,15 @@ class GEval(BaseMetric):
                    _in_component=_in_component,
                    _additional_context=_additional_context,
                )
+                settings = get_settings()
                loop.run_until_complete(
                    asyncio.wait_for(
                        coro,
-                        timeout=get_settings().DEEPEVAL_PER_TASK_TIMEOUT_SECONDS,
+                        timeout=(
+                            None
+                            if settings.DEEPEVAL_DISABLE_TIMEOUTS
+                            else settings.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS
+                        ),
                    )
                )
            else:
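The new branch leans on the fact that asyncio.wait_for treats timeout=None as "no deadline", so setting DEEPEVAL_DISABLE_TIMEOUTS simply lets the coroutine run to completion. A self-contained illustration of that behavior (unrelated to deepeval's own coroutines):

```python
import asyncio


async def slow_task() -> str:
    await asyncio.sleep(2)
    return "done"


async def main() -> None:
    # timeout=None disables the deadline entirely; no TimeoutError can be raised.
    result = await asyncio.wait_for(slow_task(), timeout=None)
    print(result)  # "done" after roughly two seconds


asyncio.run(main())
```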
deepeval/metrics/indicator.py CHANGED
@@ -1,10 +1,11 @@
+import asyncio
+import logging
+import sys
+import time
 from rich.console import Console
 from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn
 from contextlib import contextmanager
-import sys
 from typing import List, Optional, Union
-import time
-import asyncio
 
 from deepeval.errors import MissingTestCaseParamsError
 from deepeval.metrics import (
@@ -16,8 +17,8 @@ from deepeval.test_case import LLMTestCase, ConversationalTestCase
 from deepeval.test_run.cache import CachedTestCase, Cache
 from deepeval.telemetry import capture_metric_type
 from deepeval.utils import update_pbar
+from deepeval.config.settings import get_settings
 
-import logging
 
 logger = logging.getLogger(__name__)
 
@@ -260,6 +261,9 @@ async def safe_a_measure(
            "Timed out/cancelled while evaluating metric. "
            "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
            "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
+            if not get_settings().DEEPEVAL_DISABLE_TIMEOUTS
+            else "Cancelled while evaluating metric (DeepEval timeouts are disabled; this likely came from upstream orchestration or the provider/network layer). "
+            "Set DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
        )
        metric.success = False
 
deepeval/metrics/mcp/mcp_task_completion.py CHANGED
@@ -14,7 +14,7 @@ from deepeval.metrics.utils import (
 from deepeval.metrics.indicator import metric_progress_indicator
 from deepeval.test_case import ConversationalTestCase, TurnParams
 from deepeval.utils import get_or_create_event_loop, prettify_list
-from deepeval.metrics.mcp.schema import Task, TaskScore
+from deepeval.metrics.mcp.schema import Task, TaskScore, Reason
 from deepeval.metrics.mcp.template import MCPTaskCompletionTemplate
 from deepeval.errors import MissingTestCaseParamsError
 from deepeval.metrics.api import metric_data_manager
@@ -171,14 +171,13 @@ class MCPTaskCompletionMetric(BaseConversationalMetric):
         prompt = MCPTaskCompletionTemplate.generate_final_reason(
             self.score, self.success, reasons
         )
-
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt)
-            self.evaluation_cost += cost
-            return res
-        else:
-            res = self.model.generate(prompt)
-            return res
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Reason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     async def _a_generate_reason(
         self, task_scores: List[TaskScore]
@@ -194,13 +193,13 @@ class MCPTaskCompletionMetric(BaseConversationalMetric):
             self.score, self.success, reasons
         )
 
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt)
-            self.evaluation_cost += cost
-            return res
-        else:
-            res = await self.model.a_generate(prompt)
-            return res
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Reason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     def _get_task_score(self, task: Task) -> TaskScore:
         prompt = MCPTaskCompletionTemplate.get_task_completion_score(task)
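The reason-generation methods above (and the analogous ones in the files below) now delegate to generate_with_schema_and_extract / a_generate_with_schema_and_extract from deepeval.metrics.utils, whose hunks are not expanded in this diff. The two extractor callbacks each call site passes can be illustrated in isolation: assuming the helper yields either a parsed Reason instance or a plain JSON dict, both lambdas pull out the same field.

```python
# Illustration of the two extractors each call site passes; the real helper
# (in deepeval.metrics.utils, not shown here) decides which one applies.
from pydantic import BaseModel


class Reason(BaseModel):
    reason: str


extract_schema = lambda s: s.reason         # structured (schema-enforced) output
extract_json = lambda data: data["reason"]  # plain dict parsed from JSON text

structured = Reason(reason="The score is 1.0 because every task was completed.")
raw = {"reason": "The score is 1.0 because every task was completed."}

assert extract_schema(structured) == extract_json(raw)
```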
deepeval/metrics/mcp/multi_turn_mcp_use_metric.py CHANGED
@@ -14,7 +14,7 @@ from deepeval.metrics.utils import (
 from deepeval.metrics.indicator import metric_progress_indicator
 from deepeval.test_case import ConversationalTestCase, TurnParams
 from deepeval.utils import get_or_create_event_loop, prettify_list
-from deepeval.metrics.mcp.schema import Task, ArgsScore, ToolScore
+from deepeval.metrics.mcp.schema import Task, ArgsScore, ToolScore, Reason
 from deepeval.metrics.mcp.template import MCPTaskCompletionTemplate
 from deepeval.errors import MissingTestCaseParamsError
 from deepeval.metrics.api import metric_data_manager
@@ -336,13 +336,13 @@ class MultiTurnMCPUseMetric(BaseConversationalMetric):
             self.score, self.success, reasons
         )
 
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt)
-            self.evaluation_cost += cost
-            return res
-        else:
-            res = self.model.generate(prompt)
-            return res
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Reason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     async def _a_generate_reason(
         self,
@@ -363,13 +363,13 @@ class MultiTurnMCPUseMetric(BaseConversationalMetric):
             self.score, self.success, reasons
         )
 
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt)
-            self.evaluation_cost += cost
-            return res
-        else:
-            res = await self.model.a_generate(prompt)
-            return res
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Reason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     def is_successful(self) -> bool:
         if self.error is not None:
deepeval/metrics/mcp/schema.py CHANGED
@@ -20,3 +20,7 @@ class ToolScore(BaseModel):
 class ArgsScore(BaseModel):
     score: float
     reason: str
+
+
+class Reason(BaseModel):
+    reason: str
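The new Reason schema pairs with the template change below, which now instructs the judge model to answer as a JSON object with a single 'reason' key. A quick round-trip check with an illustrative sample string:

```python
from pydantic import BaseModel


class Reason(BaseModel):
    reason: str


# The shape the updated prompts ask the model to return.
raw = '{"reason": "The score is 0.8 because most tasks were completed correctly."}'
print(Reason.model_validate_json(raw).reason)
```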
deepeval/metrics/mcp/template.py CHANGED
@@ -148,6 +148,13 @@ JSON:
 Context:
 The reasons are from metrics that were used to evaluate an MCP application by determining whether the model accurately completed a task or called toos and resources with the right arguments.
 
+**
+IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
+Example JSON:
+{{
+"reason": "The score is <score> because <your_reason>."
+}}
+
 Inputs:
 - final_score: the averaged score across all interactions.
 - success: whether the metric passed or failed
@@ -173,5 +180,5 @@ JSON:
 
 Now give me a final reason that explains why the metric passed or failed. Output ONLY the reason and nothing else.
 
-The final reason:
+JSON:
 """
deepeval/metrics/prompt_alignment/prompt_alignment.py CHANGED
@@ -2,7 +2,11 @@ import asyncio
 
 from typing import Optional, List, Union
 
-from deepeval.utils import get_or_create_event_loop, prettify_list
+from deepeval.utils import (
+    get_or_create_event_loop,
+    prettify_list,
+    get_per_task_timeout,
+)
 from deepeval.metrics.utils import (
     construct_verbose_logs,
     check_llm_test_case_params,
@@ -19,7 +23,6 @@ from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.prompt_alignment.template import PromptAlignmentTemplate
 from deepeval.metrics.indicator import metric_progress_indicator
 from deepeval.metrics.prompt_alignment import schema as paschema
-from deepeval.config.settings import get_settings
 
 from deepeval.metrics.api import metric_data_manager
 
@@ -86,7 +89,7 @@ class PromptAlignmentMetric(BaseMetric):
                loop.run_until_complete(
                    asyncio.wait_for(
                        coro,
-                        timeout=get_settings().DEEPEVAL_PER_TASK_TIMEOUT_SECONDS,
+                        timeout=get_per_task_timeout(),
                    )
                )
            else:
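get_per_task_timeout is a new helper in deepeval/utils.py (listed above with +26 lines, hunks not expanded here). Judging from the equivalent inline logic in the g_eval.py hunk earlier, it presumably returns None when DEEPEVAL_DISABLE_TIMEOUTS is set; a hypothetical reimplementation, for orientation only:

```python
# Assumed behavior, inferred from the GEval change above; the real helper in
# deepeval/utils.py may differ in details.
from typing import Optional

from deepeval.config.settings import get_settings


def get_per_task_timeout() -> Optional[float]:
    settings = get_settings()
    if settings.DEEPEVAL_DISABLE_TIMEOUTS:
        return None  # asyncio.wait_for(..., timeout=None) waits indefinitely
    return settings.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS
```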
deepeval/metrics/tool_use/schema.py CHANGED
@@ -17,3 +17,7 @@ class ToolSelectionScore(BaseModel):
 class ArgumentCorrectnessScore(BaseModel):
     score: float
     reason: str
+
+
+class Reason(BaseModel):
+    reason: str
deepeval/metrics/tool_use/template.py CHANGED
@@ -161,6 +161,13 @@ class ToolUseTemplate:
 - The key patterns or trends in the sub-reasons (e.g., consistent correct choices, repeated irrelevant tool calls, missed best-fit tools).
 - A clear statement linking the **score** and **threshold** outcome (e.g., “The agent passed because…” or “Failed because…”).
 
+**
+IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
+Example JSON:
+{{
+"reason": "The score is <score> because <your_reason>."
+}}
+
 RULES:
 - Focus on *which tools were selected* and *why that selection pattern was or wasn't appropriate*.
 - Mention specific issues or strengths like redundancy, misuse, or perfect matching.
@@ -178,7 +185,7 @@ class ToolUseTemplate:
 Threshold: {threshold}
 Result: {"PASS" if final_score >= threshold else "FAIL"}
 
-Final Reason:
+JSON:
 """
 )
 
@@ -199,6 +206,13 @@ class ToolUseTemplate:
 - The dominant strengths or weaknesses from the sub-reasons (e.g., correct parameterization, missing required fields, generic values, or misaligned arguments).
 - Whether the agent met or fell short of the threshold and why.
 
+**
+IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
+Example JSON:
+{{
+"reason": "The score is <score> because <your_reason>."
+}}
+
 RULES:
 - Focus strictly on **argument correctness** and **context alignment** — not which tools were chosen.
 - Reference specific argument-level problems or successes where helpful.
@@ -215,6 +229,6 @@ class ToolUseTemplate:
 Threshold: {threshold}
 Result: {"PASS" if final_score >= threshold else "FAIL"}
 
-Final Reason:
+JSON:
 """
 )
deepeval/metrics/tool_use/tool_use.py CHANGED
@@ -23,6 +23,7 @@ from deepeval.metrics.tool_use.schema import (
     ToolSelectionScore,
     UserInputAndTools,
     ArgumentCorrectnessScore,
+    Reason,
 )
 from deepeval.metrics.api import metric_data_manager
 
@@ -356,13 +357,14 @@ class ToolUseMetric(BaseConversationalMetric):
         prompt = ToolUseTemplate.get_tool_selection_final_reason(
             scores_and_reasons, self.score, self.threshold
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt)
-            self.evaluation_cost += cost
-            return res
-        else:
-            res = self.model.generate(prompt)
-            return res
+
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Reason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     def _generate_reason_for_argument_correctness(
         self,
@@ -376,13 +378,13 @@ class ToolUseMetric(BaseConversationalMetric):
         prompt = ToolUseTemplate.get_tool_selection_final_reason(
             scores_and_reasons, self.score, self.threshold
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt)
-            self.evaluation_cost += cost
-            return res
-        else:
-            res = self.model.generate(prompt)
-            return res
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Reason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     async def _a_generate_reason_for_tool_selection(
         self, tool_use_scores: List[ToolSelectionScore]
@@ -395,13 +397,13 @@ class ToolUseMetric(BaseConversationalMetric):
         prompt = ToolUseTemplate.get_tool_selection_final_reason(
             scores_and_reasons, self.score, self.threshold
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt)
-            self.evaluation_cost += cost
-            return res
-        else:
-            res = await self.model.a_generate(prompt)
-            return res
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Reason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     async def _a_generate_reason_for_argument_correctness(
         self, argument_correctness_scores: List[ArgumentCorrectnessScore]
@@ -414,13 +416,13 @@ class ToolUseMetric(BaseConversationalMetric):
         prompt = ToolUseTemplate.get_tool_selection_final_reason(
             scores_and_reasons, self.score, self.threshold
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt)
-            self.evaluation_cost += cost
-            return res
-        else:
-            res = await self.model.a_generate(prompt)
-            return res
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Reason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     def is_successful(self) -> bool:
         try:
deepeval/metrics/topic_adherence/schema.py CHANGED
@@ -14,3 +14,7 @@ class QAPairs(BaseModel):
 class RelevancyVerdict(BaseModel):
     verdict: Literal["TP", "TN", "FP", "FN"]
     reason: str
+
+
+class TopicAdherenceReason(BaseModel):
+    reason: str
deepeval/metrics/topic_adherence/template.py CHANGED
@@ -149,6 +149,13 @@ class TopicAdherenceTemplate:
 
 Your task is to go through these reasons and give a single final explaination that clearly explains why this metric has failed or passed.
 
+**
+IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
+Example JSON:
+{{
+"reason": "The score is <score> because <your_reason>."
+}}
+
 {TopicAdherenceTemplate.multimodal_rules}
 
 Pass: {success}
@@ -170,6 +177,6 @@ class TopicAdherenceTemplate:
 
 Output ONLY the reason, DON"T output anything else.
 
-Reason:
+JSON:
 """
 )