deepeval 3.7.6__py3-none-any.whl → 3.7.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/cli/main.py +2022 -759
- deepeval/cli/utils.py +208 -36
- deepeval/config/dotenv_handler.py +19 -0
- deepeval/config/settings.py +658 -262
- deepeval/config/utils.py +9 -1
- deepeval/evaluate/execute.py +153 -94
- deepeval/key_handler.py +121 -51
- deepeval/metrics/base_metric.py +9 -3
- deepeval/metrics/g_eval/g_eval.py +6 -1
- deepeval/metrics/indicator.py +8 -4
- deepeval/metrics/mcp/mcp_task_completion.py +15 -16
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +15 -15
- deepeval/metrics/mcp/schema.py +4 -0
- deepeval/metrics/mcp/template.py +8 -1
- deepeval/metrics/prompt_alignment/prompt_alignment.py +6 -3
- deepeval/metrics/tool_use/schema.py +4 -0
- deepeval/metrics/tool_use/template.py +16 -2
- deepeval/metrics/tool_use/tool_use.py +30 -28
- deepeval/metrics/topic_adherence/schema.py +4 -0
- deepeval/metrics/topic_adherence/template.py +8 -1
- deepeval/metrics/topic_adherence/topic_adherence.py +15 -14
- deepeval/metrics/turn_contextual_precision/template.py +8 -1
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +44 -86
- deepeval/metrics/turn_contextual_recall/template.py +8 -1
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +44 -82
- deepeval/metrics/turn_contextual_relevancy/template.py +8 -1
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +48 -92
- deepeval/metrics/turn_faithfulness/template.py +8 -1
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +76 -130
- deepeval/metrics/utils.py +16 -1
- deepeval/models/__init__.py +2 -0
- deepeval/models/llms/__init__.py +2 -0
- deepeval/models/llms/amazon_bedrock_model.py +5 -4
- deepeval/models/llms/anthropic_model.py +4 -3
- deepeval/models/llms/azure_model.py +4 -3
- deepeval/models/llms/deepseek_model.py +5 -8
- deepeval/models/llms/grok_model.py +5 -8
- deepeval/models/llms/kimi_model.py +5 -8
- deepeval/models/llms/litellm_model.py +2 -0
- deepeval/models/llms/local_model.py +1 -1
- deepeval/models/llms/openai_model.py +4 -3
- deepeval/models/retry_policy.py +10 -5
- deepeval/models/utils.py +1 -5
- deepeval/simulator/conversation_simulator.py +6 -2
- deepeval/simulator/template.py +3 -1
- deepeval/synthesizer/synthesizer.py +19 -17
- deepeval/test_run/test_run.py +6 -1
- deepeval/utils.py +26 -0
- {deepeval-3.7.6.dist-info → deepeval-3.7.7.dist-info}/METADATA +3 -3
- {deepeval-3.7.6.dist-info → deepeval-3.7.7.dist-info}/RECORD +54 -53
- {deepeval-3.7.6.dist-info → deepeval-3.7.7.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.6.dist-info → deepeval-3.7.7.dist-info}/WHEEL +0 -0
- {deepeval-3.7.6.dist-info → deepeval-3.7.7.dist-info}/entry_points.txt +0 -0
deepeval/key_handler.py
CHANGED
@@ -5,7 +5,9 @@ import json
 import logging

 from enum import Enum
-from …
+from functools import lru_cache
+from pydantic import SecretStr
+from typing import get_args, get_origin, Union

 from .constants import KEY_FILE, HIDDEN_DIR

@@ -13,26 +15,34 @@ from .constants import KEY_FILE, HIDDEN_DIR
 logger = logging.getLogger(__name__)


-[20 removed lines (old 16-35) not shown in this diff view]
+@lru_cache(maxsize=1)
+def _secret_env_keys() -> frozenset[str]:
+    # Lazy import avoids cycles at import time
+    from deepeval.config.settings import Settings
+
+    secret_keys: set[str] = set()
+    for env_key, field in Settings.model_fields.items():
+        ann = field.annotation
+        if ann is SecretStr:
+            secret_keys.add(env_key)
+            continue
+
+        origin = get_origin(ann)
+        if origin is Union and any(a is SecretStr for a in get_args(ann)):
+            secret_keys.add(env_key)
+
+    return frozenset(secret_keys)
+
+
+def _env_key_for_legacy_enum(key) -> str:
+    # For ModelKeyValues, .name == .value, for KeyValues it's the important one:
+    # KeyValues.API_KEY.name == "API_KEY" (matches Settings), value == "api_key" (legacy json key)
+    return getattr(key, "name", str(key))
+
+
+def _is_secret_key(key) -> bool:
+    return _env_key_for_legacy_enum(key) in _secret_env_keys()
+

 _WARNED_SECRET_KEYS = set()

@@ -40,7 +50,10 @@ _WARNED_SECRET_KEYS = set()
 class KeyValues(Enum):
     # Confident AI
     API_KEY = "api_key"
+    CONFIDENT_API_KEY = "confident_api_key"
+    CONFIDENT_BASE_URL = "confident_base_url"
     CONFIDENT_REGION = "confident_region"
+
     # Cache
     LAST_TEST_RUN_LINK = "last_test_run_link"
     LAST_TEST_RUN_DATA = "last_test_run_data"
@@ -49,6 +62,24 @@ class KeyValues(Enum):
 class ModelKeyValues(Enum):
     # General
     TEMPERATURE = "TEMPERATURE"
+
+    # Anthropic
+    USE_ANTHROPIC_MODEL = "USE_ANTHROPIC_MODEL"
+    ANTHROPIC_API_KEY = "ANTHROPIC_API_KEY"
+    ANTHROPIC_MODEL_NAME = "ANTHROPIC_MODEL_NAME"
+    ANTHROPIC_COST_PER_INPUT_TOKEN = "ANTHROPIC_COST_PER_INPUT_TOKEN"
+    ANTHROPIC_COST_PER_OUTPUT_TOKEN = "ANTHROPIC_COST_PER_OUTPUT_TOKEN"
+
+    # AWS
+    AWS_ACCESS_KEY_ID = "AWS_ACCESS_KEY_ID"
+    AWS_SECRET_ACCESS_KEY = "AWS_SECRET_ACCESS_KEY"
+    # AWS Bedrock
+    USE_AWS_BEDROCK_MODEL = "USE_AWS_BEDROCK_MODEL"
+    AWS_BEDROCK_MODEL_NAME = "AWS_BEDROCK_MODEL_NAME"
+    AWS_BEDROCK_REGION = "AWS_BEDROCK_REGION"
+    AWS_BEDROCK_COST_PER_INPUT_TOKEN = "AWS_BEDROCK_COST_PER_INPUT_TOKEN"
+    AWS_BEDROCK_COST_PER_OUTPUT_TOKEN = "AWS_BEDROCK_COST_PER_OUTPUT_TOKEN"
+
     # Azure Open AI
     AZURE_OPENAI_API_KEY = "AZURE_OPENAI_API_KEY"
     AZURE_OPENAI_ENDPOINT = "AZURE_OPENAI_ENDPOINT"
@@ -57,43 +88,79 @@ class ModelKeyValues(Enum):
     AZURE_MODEL_NAME = "AZURE_MODEL_NAME"
     AZURE_MODEL_VERSION = "AZURE_MODEL_VERSION"
     USE_AZURE_OPENAI = "USE_AZURE_OPENAI"
-    [6 removed lines (old 60-65) not shown in this diff view]
+
+    # DeepSeek
+    USE_DEEPSEEK_MODEL = "USE_DEEPSEEK_MODEL"
+    DEEPSEEK_API_KEY = "DEEPSEEK_API_KEY"
+    DEEPSEEK_MODEL_NAME = "DEEPSEEK_MODEL_NAME"
+    DEEPSEEK_COST_PER_INPUT_TOKEN = "DEEPSEEK_COST_PER_INPUT_TOKEN"
+    DEEPSEEK_COST_PER_OUTPUT_TOKEN = "DEEPSEEK_COST_PER_OUTPUT_TOKEN"
+
     # Gemini
     USE_GEMINI_MODEL = "USE_GEMINI_MODEL"
-    GEMINI_MODEL_NAME = "GEMINI_MODEL_NAME"
     GOOGLE_API_KEY = "GOOGLE_API_KEY"
+    GEMINI_MODEL_NAME = "GEMINI_MODEL_NAME"
     GOOGLE_GENAI_USE_VERTEXAI = "GOOGLE_GENAI_USE_VERTEXAI"
     GOOGLE_CLOUD_PROJECT = "GOOGLE_CLOUD_PROJECT"
     GOOGLE_CLOUD_LOCATION = "GOOGLE_CLOUD_LOCATION"
     GOOGLE_SERVICE_ACCOUNT_KEY = "GOOGLE_SERVICE_ACCOUNT_KEY"
+
+    # Grok
+    USE_GROK_MODEL = "USE_GROK_MODEL"
+    GROK_API_KEY = "GROK_API_KEY"
+    GROK_MODEL_NAME = "GROK_MODEL_NAME"
+    GROK_COST_PER_INPUT_TOKEN = "GROK_COST_PER_INPUT_TOKEN"
+    GROK_COST_PER_OUTPUT_TOKEN = "GROK_COST_PER_OUTPUT_TOKEN"
+
     # LiteLLM
     USE_LITELLM = "USE_LITELLM"
-    LITELLM_MODEL_NAME = "LITELLM_MODEL_NAME"
     LITELLM_API_KEY = "LITELLM_API_KEY"
+    LITELLM_MODEL_NAME = "LITELLM_MODEL_NAME"
     LITELLM_API_BASE = "LITELLM_API_BASE"
+    LITELLM_PROXY_API_BASE = "LITELLM_PROXY_API_BASE"
+    LITELLM_PROXY_API_KEY = "LITELLM_PROXY_API_KEY"
+
+    # LM Studio
+    LM_STUDIO_API_KEY = "LM_STUDIO_API_KEY"
+    LM_STUDIO_MODEL_NAME = "LM_STUDIO_MODEL_NAME"
+
+    # Local Model
+    USE_LOCAL_MODEL = "USE_LOCAL_MODEL"
+    LOCAL_MODEL_API_KEY = "LOCAL_MODEL_API_KEY"
+    LOCAL_MODEL_NAME = "LOCAL_MODEL_NAME"
+    LOCAL_MODEL_BASE_URL = "LOCAL_MODEL_BASE_URL"
+    LOCAL_MODEL_FORMAT = "LOCAL_MODEL_FORMAT"
+
+    # Moonshot
+    USE_MOONSHOT_MODEL = "USE_MOONSHOT_MODEL"
+    MOONSHOT_API_KEY = "MOONSHOT_API_KEY"
+    MOONSHOT_MODEL_NAME = "MOONSHOT_MODEL_NAME"
+    MOONSHOT_COST_PER_INPUT_TOKEN = "MOONSHOT_COST_PER_INPUT_TOKEN"
+    MOONSHOT_COST_PER_OUTPUT_TOKEN = "MOONSHOT_COST_PER_OUTPUT_TOKEN"
+
+    # Ollama
+    OLLAMA_MODEL_NAME = "OLLAMA_MODEL_NAME"
+
     # OpenAI
     USE_OPENAI_MODEL = "USE_OPENAI_MODEL"
+    OPENAI_API_KEY = "OPENAI_API_KEY"
     OPENAI_MODEL_NAME = "OPENAI_MODEL_NAME"
     OPENAI_COST_PER_INPUT_TOKEN = "OPENAI_COST_PER_INPUT_TOKEN"
     OPENAI_COST_PER_OUTPUT_TOKEN = "OPENAI_COST_PER_OUTPUT_TOKEN"
-    [13 removed lines (old 84-96) not shown in this diff view]
+
+    # PortKey
+    USE_PORTKEY_MODEL = "USE_PORTKEY_MODEL"
+    PORTKEY_API_KEY = "PORTKEY_API_KEY"
+    PORTKEY_MODEL_NAME = "PORTKEY_MODEL_NAME"
+    PORTKEY_BASE_URL = "PORTKEY_BASE_URL"
+    PORTKEY_PROVIDER_NAME = "PORTKEY_PROVIDER_NAME"
+
+    # Vertex AI
+    VERTEX_AI_MODEL_NAME = "VERTEX_AI_MODEL_NAME"
+
+    # VLLM
+    VLLM_API_KEY = "VLLM_API_KEY"
+    VLLM_MODEL_NAME = "VLLM_MODEL_NAME"


 class EmbeddingKeyValues(Enum):
@@ -123,9 +190,11 @@ class KeyFileHandler:
         """Appends or updates data in the hidden file"""

         # hard stop on secrets: never write to disk
-        if key …
+        if _is_secret_key(key):
             logger.warning(
-                [removed line (old 128) not shown in this diff view]
+                "%s is a secret setting, refusing to persist. "
+                "Keep your secrets in .env or .env.local instead.",
+                _env_key_for_legacy_enum(key),
             )
             return

@@ -170,16 +239,17 @@ class KeyFileHandler:
         # Deprecation: warn only if we're actually returning a secret
         if (
             value is not None
-            and key …
-            and key …
+            and _is_secret_key(key)
+            and _env_key_for_legacy_enum(key) not in _WARNED_SECRET_KEYS
         ):
             logger.warning(
-                [4 removed lines (old 177-180) not shown in this diff view]
+                "Reading secret '%s' from legacy %s/%s. Persisting API keys in plaintext is deprecated. "
+                "Move this to your environment (.env / .env.local). This fallback will be removed in a future release.",
+                _env_key_for_legacy_enum(key),
+                HIDDEN_DIR,
+                KEY_FILE,
            )
-            _WARNED_SECRET_KEYS.add(key …
+            _WARNED_SECRET_KEYS.add(_env_key_for_legacy_enum(key))

         return value
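A minimal sketch of the secret-detection idea above, using a stand-in settings model rather than the real deepeval.config.settings.Settings (DemoSettings and secret_field_names are illustrative names, not part of the package):

from typing import Optional, Union, get_args, get_origin

from pydantic import BaseModel, SecretStr


class DemoSettings(BaseModel):
    # stand-in for deepeval.config.settings.Settings
    OPENAI_API_KEY: Optional[SecretStr] = None
    OPENAI_MODEL_NAME: Optional[str] = None


def secret_field_names(model_cls) -> frozenset:
    # Mirrors _secret_env_keys: a field counts as secret if it is SecretStr
    # or a Union/Optional that includes SecretStr.
    secrets = set()
    for name, field in model_cls.model_fields.items():
        ann = field.annotation
        if ann is SecretStr or (
            get_origin(ann) is Union and SecretStr in get_args(ann)
        ):
            secrets.add(name)
    return frozenset(secrets)


print(secret_field_names(DemoSettings))  # frozenset({'OPENAI_API_KEY'})

KeyFileHandler then refuses to persist any key whose name lands in this set, which is why API keys now stay in .env / .env.local rather than the hidden JSON key file.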
deepeval/metrics/base_metric.py
CHANGED
@@ -49,8 +49,10 @@ class BaseMetric:
         return "Base Metric"

     def _accrue_cost(self, cost: float) -> None:
-        if self.evaluation_cost is not None:
+        if self.evaluation_cost is not None and cost is not None:
             self.evaluation_cost += cost
+        else:
+            self.evaluation_cost = None


 class BaseConversationalMetric:
@@ -94,8 +96,10 @@ class BaseConversationalMetric:
         return "Base Conversational Metric"

     def _accrue_cost(self, cost: float) -> None:
-        if self.evaluation_cost is not None:
+        if self.evaluation_cost is not None and cost is not None:
             self.evaluation_cost += cost
+        else:
+            self.evaluation_cost = None


 class BaseArenaMetric:
@@ -129,5 +133,7 @@ class BaseArenaMetric:
         return "Base Arena Metric"

     def _accrue_cost(self, cost: float) -> None:
-        if self.evaluation_cost is not None:
+        if self.evaluation_cost is not None and cost is not None:
             self.evaluation_cost += cost
+        else:
+            self.evaluation_cost = None
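The effect of the new guard, shown on a tiny stand-in class (CostTracker is hypothetical; only the _accrue_cost body mirrors the diff): once any call reports an unknown cost, the running total becomes None instead of failing on None arithmetic.

from typing import Optional


class CostTracker:
    def __init__(self) -> None:
        self.evaluation_cost: Optional[float] = 0.0

    def _accrue_cost(self, cost: Optional[float]) -> None:
        if self.evaluation_cost is not None and cost is not None:
            self.evaluation_cost += cost
        else:
            self.evaluation_cost = None  # total cost is now unknown


tracker = CostTracker()
tracker._accrue_cost(0.002)
tracker._accrue_cost(None)  # e.g. a custom model that reports no cost
print(tracker.evaluation_cost)  # None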
deepeval/metrics/g_eval/g_eval.py
CHANGED
@@ -110,10 +110,15 @@ class GEval(BaseMetric):
                 _in_component=_in_component,
                 _additional_context=_additional_context,
             )
+            settings = get_settings()
             loop.run_until_complete(
                 asyncio.wait_for(
                     coro,
-                    timeout= …
+                    timeout=(
+                        None
+                        if settings.DEEPEVAL_DISABLE_TIMEOUTS
+                        else settings.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS
+                    ),
                 )
             )
         else:
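The timeout wiring above reduces to a small asyncio pattern; the sketch below is illustrative and only the two settings names are taken from the diff:

import asyncio


async def evaluate() -> str:
    await asyncio.sleep(0.1)  # placeholder for the real evaluation coroutine
    return "done"


def run(disable_timeouts: bool, per_task_timeout_seconds: float) -> str:
    # DEEPEVAL_DISABLE_TIMEOUTS=True maps to timeout=None (wait indefinitely),
    # otherwise DEEPEVAL_PER_TASK_TIMEOUT_SECONDS bounds the task.
    timeout = None if disable_timeouts else per_task_timeout_seconds
    return asyncio.run(asyncio.wait_for(evaluate(), timeout=timeout))


print(run(disable_timeouts=False, per_task_timeout_seconds=5.0))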
deepeval/metrics/indicator.py
CHANGED
@@ -1,10 +1,11 @@
+import asyncio
+import logging
+import sys
+import time
 from rich.console import Console
 from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn
 from contextlib import contextmanager
-import sys
 from typing import List, Optional, Union
-import time
-import asyncio

 from deepeval.errors import MissingTestCaseParamsError
 from deepeval.metrics import (
@@ -16,8 +17,8 @@ from deepeval.test_case import LLMTestCase, ConversationalTestCase
 from deepeval.test_run.cache import CachedTestCase, Cache
 from deepeval.telemetry import capture_metric_type
 from deepeval.utils import update_pbar
+from deepeval.config.settings import get_settings

-import logging

 logger = logging.getLogger(__name__)

@@ -260,6 +261,9 @@ async def safe_a_measure(
                 "Timed out/cancelled while evaluating metric. "
                 "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
                 "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
+                if not get_settings().DEEPEVAL_DISABLE_TIMEOUTS
+                else "Cancelled while evaluating metric (DeepEval timeouts are disabled; this likely came from upstream orchestration or the provider/network layer). "
+                "Set DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
             )
             metric.success = False
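For reference, the settings named in that message are environment-backed; something like the following should steer them, though the accepted value formats are defined in deepeval/config/settings.py and "1" is an assumption here:

import os

# raise the per-task budget for slow judge models
os.environ["DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE"] = "300"
# keep full tracebacks when a metric errors out
os.environ["DEEPEVAL_LOG_STACK_TRACES"] = "1"
# or opt out of DeepEval-managed timeouts entirely
os.environ["DEEPEVAL_DISABLE_TIMEOUTS"] = "1"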
deepeval/metrics/mcp/mcp_task_completion.py
CHANGED
@@ -14,7 +14,7 @@ from deepeval.metrics.utils import (
 from deepeval.metrics.indicator import metric_progress_indicator
 from deepeval.test_case import ConversationalTestCase, TurnParams
 from deepeval.utils import get_or_create_event_loop, prettify_list
-from deepeval.metrics.mcp.schema import Task, TaskScore
+from deepeval.metrics.mcp.schema import Task, TaskScore, Reason
 from deepeval.metrics.mcp.template import MCPTaskCompletionTemplate
 from deepeval.errors import MissingTestCaseParamsError
 from deepeval.metrics.api import metric_data_manager
@@ -171,14 +171,13 @@ class MCPTaskCompletionMetric(BaseConversationalMetric):
         prompt = MCPTaskCompletionTemplate.generate_final_reason(
             self.score, self.success, reasons
         )
-        [7 removed lines (old 174-180) not shown in this diff view]
-        return res
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Reason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )

     async def _a_generate_reason(
         self, task_scores: List[TaskScore]
@@ -194,13 +193,13 @@ class MCPTaskCompletionMetric(BaseConversationalMetric):
             self.score, self.success, reasons
         )

-        [7 removed lines (old 197-203) not shown in this diff view]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Reason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )

     def _get_task_score(self, task: Task) -> TaskScore:
         prompt = MCPTaskCompletionTemplate.get_task_completion_score(task)
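generate_with_schema_and_extract and its async twin live in deepeval/metrics/utils.py (also touched in this release). Their exact bodies are not shown in this excerpt; judging only from the call sites above, the helper plausibly wraps the long-standing schema-or-JSON fallback pattern, roughly:

from typing import Any, Callable, Type

from pydantic import BaseModel

from deepeval.metrics.utils import trimAndLoadJson  # deepeval's tolerant JSON loader


def generate_with_schema_and_extract_sketch(
    metric: Any,
    prompt: str,
    schema_cls: Type[BaseModel],
    extract_schema: Callable[[BaseModel], Any],
    extract_json: Callable[[dict], Any],
) -> Any:
    # Assumption: native providers return (result, cost) and honor schema-constrained output.
    if metric.using_native_model:
        res, cost = metric.model.generate(prompt, schema=schema_cls)
        metric._accrue_cost(cost)
        return extract_schema(res)
    # Custom models may not support structured output; fall back to parsing raw JSON.
    try:
        res = metric.model.generate(prompt, schema=schema_cls)
        return extract_schema(res)
    except TypeError:
        data = trimAndLoadJson(metric.model.generate(prompt), metric)
        return extract_json(data)

This is a sketch of the behavior implied by the call sites, not the actual implementation.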
deepeval/metrics/mcp/multi_turn_mcp_use_metric.py
CHANGED
@@ -14,7 +14,7 @@ from deepeval.metrics.utils import (
 from deepeval.metrics.indicator import metric_progress_indicator
 from deepeval.test_case import ConversationalTestCase, TurnParams
 from deepeval.utils import get_or_create_event_loop, prettify_list
-from deepeval.metrics.mcp.schema import Task, ArgsScore, ToolScore
+from deepeval.metrics.mcp.schema import Task, ArgsScore, ToolScore, Reason
 from deepeval.metrics.mcp.template import MCPTaskCompletionTemplate
 from deepeval.errors import MissingTestCaseParamsError
 from deepeval.metrics.api import metric_data_manager
@@ -336,13 +336,13 @@ class MultiTurnMCPUseMetric(BaseConversationalMetric):
             self.score, self.success, reasons
         )

-        [7 removed lines (old 339-345) not shown in this diff view]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Reason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )

     async def _a_generate_reason(
         self,
@@ -363,13 +363,13 @@ class MultiTurnMCPUseMetric(BaseConversationalMetric):
             self.score, self.success, reasons
         )

-        [7 removed lines (old 366-372) not shown in this diff view]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Reason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )

     def is_successful(self) -> bool:
         if self.error is not None:
deepeval/metrics/mcp/schema.py
CHANGED
deepeval/metrics/mcp/template.py
CHANGED
@@ -148,6 +148,13 @@ JSON:
 Context:
 The reasons are from metrics that were used to evaluate an MCP application by determining whether the model accurately completed a task or called toos and resources with the right arguments.

+**
+IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
+Example JSON:
+{{
+    "reason": "The score is <score> because <your_reason>."
+}}
+
 Inputs:
 - final_score: the averaged score across all interactions.
 - success: whether the metric passed or failed
@@ -173,5 +180,5 @@ JSON:

 Now give me a final reason that explains why the metric passed or failed. Output ONLY the reason and nothing else.

-[removed line (old 176) not shown in this diff view]
+JSON:
 """
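These prompts now close with a JSON: cue and an explicit {"reason": ...} example; the matching Reason model added to deepeval/metrics/mcp/schema.py (and mirrored in the tool_use and topic_adherence schemas) is presumably a one-field pydantic class along these lines, inferred from extract_schema=lambda s: s.reason:

from pydantic import BaseModel


class Reason(BaseModel):
    reason: str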
deepeval/metrics/prompt_alignment/prompt_alignment.py
CHANGED
@@ -2,7 +2,11 @@ import asyncio

 from typing import Optional, List, Union

-from deepeval.utils import …
+from deepeval.utils import (
+    get_or_create_event_loop,
+    prettify_list,
+    get_per_task_timeout,
+)
 from deepeval.metrics.utils import (
     construct_verbose_logs,
     check_llm_test_case_params,
@@ -19,7 +23,6 @@ from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.prompt_alignment.template import PromptAlignmentTemplate
 from deepeval.metrics.indicator import metric_progress_indicator
 from deepeval.metrics.prompt_alignment import schema as paschema
-from deepeval.config.settings import get_settings

 from deepeval.metrics.api import metric_data_manager

@@ -86,7 +89,7 @@ class PromptAlignmentMetric(BaseMetric):
             loop.run_until_complete(
                 asyncio.wait_for(
                     coro,
-                    timeout= …
+                    timeout=get_per_task_timeout(),
                 )
             )
         else:
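get_per_task_timeout is new in deepeval/utils.py this release. Its body is not part of this excerpt; based on how GEval resolves the same two settings above, it plausibly boils down to:

from typing import Optional

from deepeval.config.settings import get_settings


def get_per_task_timeout_sketch() -> Optional[float]:
    # Illustrative sketch, not the packaged implementation.
    settings = get_settings()
    if settings.DEEPEVAL_DISABLE_TIMEOUTS:
        return None  # asyncio.wait_for(..., timeout=None) waits indefinitely
    return settings.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS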
deepeval/metrics/tool_use/template.py
CHANGED
@@ -161,6 +161,13 @@ class ToolUseTemplate:
 - The key patterns or trends in the sub-reasons (e.g., consistent correct choices, repeated irrelevant tool calls, missed best-fit tools).
 - A clear statement linking the **score** and **threshold** outcome (e.g., “The agent passed because…” or “Failed because…”).

+**
+IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
+Example JSON:
+{{
+    "reason": "The score is <score> because <your_reason>."
+}}
+
 RULES:
 - Focus on *which tools were selected* and *why that selection pattern was or wasn't appropriate*.
 - Mention specific issues or strengths like redundancy, misuse, or perfect matching.
@@ -178,7 +185,7 @@ class ToolUseTemplate:
 Threshold: {threshold}
 Result: {"PASS" if final_score >= threshold else "FAIL"}

-[removed line (old 181) not shown in this diff view]
+JSON:
 """
 )

@@ -199,6 +206,13 @@ class ToolUseTemplate:
 - The dominant strengths or weaknesses from the sub-reasons (e.g., correct parameterization, missing required fields, generic values, or misaligned arguments).
 - Whether the agent met or fell short of the threshold and why.

+**
+IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
+Example JSON:
+{{
+    "reason": "The score is <score> because <your_reason>."
+}}
+
 RULES:
 - Focus strictly on **argument correctness** and **context alignment** — not which tools were chosen.
 - Reference specific argument-level problems or successes where helpful.
@@ -215,6 +229,6 @@ class ToolUseTemplate:
 Threshold: {threshold}
 Result: {"PASS" if final_score >= threshold else "FAIL"}

-[removed line (old 218) not shown in this diff view]
+JSON:
 """
 )
deepeval/metrics/tool_use/tool_use.py
CHANGED
@@ -23,6 +23,7 @@ from deepeval.metrics.tool_use.schema import (
     ToolSelectionScore,
     UserInputAndTools,
     ArgumentCorrectnessScore,
+    Reason,
 )
 from deepeval.metrics.api import metric_data_manager

@@ -356,13 +357,14 @@ class ToolUseMetric(BaseConversationalMetric):
         prompt = ToolUseTemplate.get_tool_selection_final_reason(
             scores_and_reasons, self.score, self.threshold
         )
-        [7 removed lines (old 359-365) not shown in this diff view]
+
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Reason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )

     def _generate_reason_for_argument_correctness(
         self,
@@ -376,13 +378,13 @@ class ToolUseMetric(BaseConversationalMetric):
         prompt = ToolUseTemplate.get_tool_selection_final_reason(
             scores_and_reasons, self.score, self.threshold
         )
-        [7 removed lines (old 379-385) not shown in this diff view]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Reason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )

     async def _a_generate_reason_for_tool_selection(
         self, tool_use_scores: List[ToolSelectionScore]
@@ -395,13 +397,13 @@ class ToolUseMetric(BaseConversationalMetric):
         prompt = ToolUseTemplate.get_tool_selection_final_reason(
             scores_and_reasons, self.score, self.threshold
         )
-        [7 removed lines (old 398-404) not shown in this diff view]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Reason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )

     async def _a_generate_reason_for_argument_correctness(
         self, argument_correctness_scores: List[ArgumentCorrectnessScore]
@@ -414,13 +416,13 @@ class ToolUseMetric(BaseConversationalMetric):
         prompt = ToolUseTemplate.get_tool_selection_final_reason(
             scores_and_reasons, self.score, self.threshold
         )
-        [7 removed lines (old 417-423) not shown in this diff view]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Reason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )

     def is_successful(self) -> bool:
         try:
deepeval/metrics/topic_adherence/template.py
CHANGED
@@ -149,6 +149,13 @@ class TopicAdherenceTemplate:

 Your task is to go through these reasons and give a single final explaination that clearly explains why this metric has failed or passed.

+**
+IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
+Example JSON:
+{{
+    "reason": "The score is <score> because <your_reason>."
+}}
+
 {TopicAdherenceTemplate.multimodal_rules}

 Pass: {success}
@@ -170,6 +177,6 @@ class TopicAdherenceTemplate:

 Output ONLY the reason, DON"T output anything else.

-[removed line (old 173) not shown in this diff view]
+JSON:
 """
 )