deepeval 3.7.5__py3-none-any.whl → 3.7.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/cli/main.py +2022 -759
- deepeval/cli/utils.py +208 -36
- deepeval/config/dotenv_handler.py +19 -0
- deepeval/config/settings.py +675 -245
- deepeval/config/utils.py +9 -1
- deepeval/dataset/api.py +23 -1
- deepeval/dataset/golden.py +106 -21
- deepeval/evaluate/evaluate.py +0 -3
- deepeval/evaluate/execute.py +162 -315
- deepeval/evaluate/utils.py +6 -30
- deepeval/key_handler.py +124 -51
- deepeval/metrics/__init__.py +0 -4
- deepeval/metrics/answer_relevancy/answer_relevancy.py +89 -132
- deepeval/metrics/answer_relevancy/template.py +102 -179
- deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
- deepeval/metrics/arena_g_eval/template.py +17 -1
- deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
- deepeval/metrics/argument_correctness/template.py +19 -2
- deepeval/metrics/base_metric.py +19 -41
- deepeval/metrics/bias/bias.py +102 -108
- deepeval/metrics/bias/template.py +14 -2
- deepeval/metrics/contextual_precision/contextual_precision.py +56 -92
- deepeval/metrics/contextual_recall/contextual_recall.py +58 -85
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +53 -83
- deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
- deepeval/metrics/conversation_completeness/template.py +23 -3
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
- deepeval/metrics/conversational_dag/nodes.py +66 -123
- deepeval/metrics/conversational_dag/templates.py +16 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
- deepeval/metrics/dag/dag.py +10 -0
- deepeval/metrics/dag/nodes.py +63 -126
- deepeval/metrics/dag/templates.py +14 -0
- deepeval/metrics/exact_match/exact_match.py +9 -1
- deepeval/metrics/faithfulness/faithfulness.py +82 -136
- deepeval/metrics/g_eval/g_eval.py +93 -79
- deepeval/metrics/g_eval/template.py +18 -1
- deepeval/metrics/g_eval/utils.py +7 -6
- deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
- deepeval/metrics/goal_accuracy/template.py +21 -3
- deepeval/metrics/hallucination/hallucination.py +60 -75
- deepeval/metrics/hallucination/template.py +13 -0
- deepeval/metrics/indicator.py +11 -10
- deepeval/metrics/json_correctness/json_correctness.py +40 -38
- deepeval/metrics/json_correctness/template.py +10 -0
- deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
- deepeval/metrics/knowledge_retention/schema.py +9 -3
- deepeval/metrics/knowledge_retention/template.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +72 -43
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +93 -75
- deepeval/metrics/mcp/schema.py +4 -0
- deepeval/metrics/mcp/template.py +59 -0
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
- deepeval/metrics/mcp_use_metric/template.py +12 -0
- deepeval/metrics/misuse/misuse.py +77 -97
- deepeval/metrics/misuse/template.py +15 -0
- deepeval/metrics/multimodal_metrics/__init__.py +0 -1
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +37 -38
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +55 -76
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +37 -38
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +37 -38
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +57 -76
- deepeval/metrics/non_advice/non_advice.py +79 -105
- deepeval/metrics/non_advice/template.py +12 -0
- deepeval/metrics/pattern_match/pattern_match.py +12 -4
- deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
- deepeval/metrics/pii_leakage/template.py +14 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
- deepeval/metrics/plan_adherence/template.py +11 -0
- deepeval/metrics/plan_quality/plan_quality.py +63 -87
- deepeval/metrics/plan_quality/template.py +9 -0
- deepeval/metrics/prompt_alignment/prompt_alignment.py +78 -86
- deepeval/metrics/prompt_alignment/template.py +12 -0
- deepeval/metrics/role_adherence/role_adherence.py +48 -71
- deepeval/metrics/role_adherence/template.py +14 -0
- deepeval/metrics/role_violation/role_violation.py +75 -108
- deepeval/metrics/role_violation/template.py +12 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
- deepeval/metrics/step_efficiency/template.py +11 -0
- deepeval/metrics/summarization/summarization.py +115 -183
- deepeval/metrics/summarization/template.py +19 -0
- deepeval/metrics/task_completion/task_completion.py +67 -73
- deepeval/metrics/tool_correctness/tool_correctness.py +43 -42
- deepeval/metrics/tool_use/schema.py +4 -0
- deepeval/metrics/tool_use/template.py +16 -2
- deepeval/metrics/tool_use/tool_use.py +72 -94
- deepeval/metrics/topic_adherence/schema.py +4 -0
- deepeval/metrics/topic_adherence/template.py +21 -1
- deepeval/metrics/topic_adherence/topic_adherence.py +68 -81
- deepeval/metrics/toxicity/template.py +13 -0
- deepeval/metrics/toxicity/toxicity.py +80 -99
- deepeval/metrics/turn_contextual_precision/schema.py +3 -3
- deepeval/metrics/turn_contextual_precision/template.py +9 -2
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +154 -154
- deepeval/metrics/turn_contextual_recall/schema.py +3 -3
- deepeval/metrics/turn_contextual_recall/template.py +8 -1
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +148 -143
- deepeval/metrics/turn_contextual_relevancy/schema.py +2 -2
- deepeval/metrics/turn_contextual_relevancy/template.py +8 -1
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +154 -157
- deepeval/metrics/turn_faithfulness/schema.py +1 -1
- deepeval/metrics/turn_faithfulness/template.py +8 -1
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +180 -203
- deepeval/metrics/turn_relevancy/template.py +14 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
- deepeval/metrics/utils.py +161 -91
- deepeval/models/__init__.py +2 -0
- deepeval/models/base_model.py +44 -6
- deepeval/models/embedding_models/azure_embedding_model.py +34 -12
- deepeval/models/embedding_models/local_embedding_model.py +22 -7
- deepeval/models/embedding_models/ollama_embedding_model.py +17 -6
- deepeval/models/embedding_models/openai_embedding_model.py +3 -2
- deepeval/models/llms/__init__.py +2 -0
- deepeval/models/llms/amazon_bedrock_model.py +229 -73
- deepeval/models/llms/anthropic_model.py +143 -48
- deepeval/models/llms/azure_model.py +169 -95
- deepeval/models/llms/constants.py +2032 -0
- deepeval/models/llms/deepseek_model.py +82 -35
- deepeval/models/llms/gemini_model.py +126 -67
- deepeval/models/llms/grok_model.py +128 -65
- deepeval/models/llms/kimi_model.py +129 -87
- deepeval/models/llms/litellm_model.py +94 -18
- deepeval/models/llms/local_model.py +115 -16
- deepeval/models/llms/ollama_model.py +97 -76
- deepeval/models/llms/openai_model.py +169 -311
- deepeval/models/llms/portkey_model.py +58 -16
- deepeval/models/llms/utils.py +5 -2
- deepeval/models/retry_policy.py +10 -5
- deepeval/models/utils.py +56 -4
- deepeval/simulator/conversation_simulator.py +49 -2
- deepeval/simulator/template.py +16 -1
- deepeval/synthesizer/synthesizer.py +19 -17
- deepeval/test_case/api.py +24 -45
- deepeval/test_case/arena_test_case.py +7 -2
- deepeval/test_case/conversational_test_case.py +55 -6
- deepeval/test_case/llm_test_case.py +60 -6
- deepeval/test_run/api.py +3 -0
- deepeval/test_run/test_run.py +6 -1
- deepeval/utils.py +26 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/METADATA +3 -3
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/RECORD +145 -148
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -133
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/WHEEL +0 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/entry_points.txt +0 -0
deepeval/evaluate/utils.py
CHANGED
@@ -11,7 +11,6 @@ from deepeval.metrics import (
     ArenaGEval,
     BaseMetric,
     BaseConversationalMetric,
-    BaseMultimodalMetric,
 )
 from deepeval.test_case import (
     LLMTestCase,
@@ -218,9 +217,9 @@ def validate_assert_test_inputs(
         )

     if test_case and metrics:
-        if (
-            isinstance(
-        )
+        if (isinstance(test_case, LLMTestCase)) and not all(
+            isinstance(metric, BaseMetric) for metric in metrics
+        ):
             raise ValueError(
                 "All 'metrics' for an 'LLMTestCase' must be instances of 'BaseMetric' only."
             )
@@ -230,18 +229,6 @@ def validate_assert_test_inputs(
             raise ValueError(
                 "All 'metrics' for an 'ConversationalTestCase' must be instances of 'BaseConversationalMetric' only."
             )
-        if (
-            isinstance(test_case, LLMTestCase) and test_case.multimodal
-        ) and not all(
-            (
-                isinstance(metric, BaseMultimodalMetric)
-                or isinstance(metric, BaseMetric)
-            )
-            for metric in metrics
-        ):
-            raise ValueError(
-                "All 'metrics' for multi-modal LLMTestCase must be instances of 'BaseMultimodalMetric' only."
-            )

     if not ((golden and observed_callback) or (test_case and metrics)):
         raise ValueError(
@@ -259,7 +246,6 @@ def validate_evaluate_inputs(
         Union[
             List[BaseMetric],
             List[BaseConversationalMetric],
-            List[BaseMultimodalMetric],
         ]
     ] = None,
     metric_collection: Optional[str] = None,
@@ -292,10 +278,9 @@ def validate_evaluate_inputs(
     if test_cases and metrics:
         for test_case in test_cases:
            for metric in metrics:
-                if (
[2 removed lines not captured in this extract]
-                ) and not isinstance(metric, BaseMetric):
+                if (isinstance(test_case, LLMTestCase)) and not isinstance(
+                    metric, BaseMetric
+                ):
                     raise ValueError(
                         f"Metric {metric.__name__} is not a valid metric for LLMTestCase."
                     )
@@ -306,15 +291,6 @@ def validate_evaluate_inputs(
                     raise ValueError(
                         f"Metric {metric.__name__} is not a valid metric for ConversationalTestCase."
                     )
-                if (
-                    isinstance(test_case, LLMTestCase) and test_case.multimodal
-                ) and not (
-                    isinstance(metric, BaseMultimodalMetric)
-                    or isinstance(metric, BaseMetric)
-                ):
-                    raise ValueError(
-                        f"Metric {metric.__name__} is not a valid metric for multi-modal LLMTestCase."
-                    )


 def print_test_result(test_result: TestResult, display: TestRunResultDisplay):
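Note: the validators above no longer special-case multimodal test cases; an LLMTestCase (multimodal or not) is accepted only with BaseMetric instances, and the BaseMultimodalMetric branch is gone. A minimal sketch of the tightened rule, using a stand-in helper name and illustrative values rather than the real validate_assert_test_inputs signature:

from deepeval.metrics import BaseMetric
from deepeval.test_case import LLMTestCase


def check_metrics_for_llm_test_case(test_case, metrics):
    # Mirrors the new guard in validate_assert_test_inputs (3.7.7):
    # a single isinstance check against BaseMetric, no multimodal escape hatch.
    if isinstance(test_case, LLMTestCase) and not all(
        isinstance(metric, BaseMetric) for metric in metrics
    ):
        raise ValueError(
            "All 'metrics' for an 'LLMTestCase' must be instances of 'BaseMetric' only."
        )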
deepeval/key_handler.py
CHANGED
@@ -5,7 +5,9 @@ import json
 import logging

 from enum import Enum
-from
+from functools import lru_cache
+from pydantic import SecretStr
+from typing import get_args, get_origin, Union

 from .constants import KEY_FILE, HIDDEN_DIR

@@ -13,26 +15,34 @@ from .constants import KEY_FILE, HIDDEN_DIR
 logger = logging.getLogger(__name__)


[20 removed lines not captured in this extract]
+@lru_cache(maxsize=1)
+def _secret_env_keys() -> frozenset[str]:
+    # Lazy import avoids cycles at import time
+    from deepeval.config.settings import Settings
+
+    secret_keys: set[str] = set()
+    for env_key, field in Settings.model_fields.items():
+        ann = field.annotation
+        if ann is SecretStr:
+            secret_keys.add(env_key)
+            continue
+
+        origin = get_origin(ann)
+        if origin is Union and any(a is SecretStr for a in get_args(ann)):
+            secret_keys.add(env_key)
+
+    return frozenset(secret_keys)
+
+
+def _env_key_for_legacy_enum(key) -> str:
+    # For ModelKeyValues, .name == .value, for KeyValues it's the important one:
+    # KeyValues.API_KEY.name == "API_KEY" (matches Settings), value == "api_key" (legacy json key)
+    return getattr(key, "name", str(key))
+
+
+def _is_secret_key(key) -> bool:
+    return _env_key_for_legacy_enum(key) in _secret_env_keys()
+

 _WARNED_SECRET_KEYS = set()

@@ -40,7 +50,10 @@ _WARNED_SECRET_KEYS = set()
 class KeyValues(Enum):
     # Confident AI
     API_KEY = "api_key"
+    CONFIDENT_API_KEY = "confident_api_key"
+    CONFIDENT_BASE_URL = "confident_base_url"
     CONFIDENT_REGION = "confident_region"
+
     # Cache
     LAST_TEST_RUN_LINK = "last_test_run_link"
     LAST_TEST_RUN_DATA = "last_test_run_data"
@@ -49,6 +62,24 @@ class KeyValues(Enum):
 class ModelKeyValues(Enum):
     # General
     TEMPERATURE = "TEMPERATURE"
+
+    # Anthropic
+    USE_ANTHROPIC_MODEL = "USE_ANTHROPIC_MODEL"
+    ANTHROPIC_API_KEY = "ANTHROPIC_API_KEY"
+    ANTHROPIC_MODEL_NAME = "ANTHROPIC_MODEL_NAME"
+    ANTHROPIC_COST_PER_INPUT_TOKEN = "ANTHROPIC_COST_PER_INPUT_TOKEN"
+    ANTHROPIC_COST_PER_OUTPUT_TOKEN = "ANTHROPIC_COST_PER_OUTPUT_TOKEN"
+
+    # AWS
+    AWS_ACCESS_KEY_ID = "AWS_ACCESS_KEY_ID"
+    AWS_SECRET_ACCESS_KEY = "AWS_SECRET_ACCESS_KEY"
+    # AWS Bedrock
+    USE_AWS_BEDROCK_MODEL = "USE_AWS_BEDROCK_MODEL"
+    AWS_BEDROCK_MODEL_NAME = "AWS_BEDROCK_MODEL_NAME"
+    AWS_BEDROCK_REGION = "AWS_BEDROCK_REGION"
+    AWS_BEDROCK_COST_PER_INPUT_TOKEN = "AWS_BEDROCK_COST_PER_INPUT_TOKEN"
+    AWS_BEDROCK_COST_PER_OUTPUT_TOKEN = "AWS_BEDROCK_COST_PER_OUTPUT_TOKEN"
+
     # Azure Open AI
     AZURE_OPENAI_API_KEY = "AZURE_OPENAI_API_KEY"
     AZURE_OPENAI_ENDPOINT = "AZURE_OPENAI_ENDPOINT"
@@ -57,49 +88,88 @@ class ModelKeyValues(Enum):
     AZURE_MODEL_NAME = "AZURE_MODEL_NAME"
     AZURE_MODEL_VERSION = "AZURE_MODEL_VERSION"
     USE_AZURE_OPENAI = "USE_AZURE_OPENAI"
[6 removed lines not captured in this extract]
+
+    # DeepSeek
+    USE_DEEPSEEK_MODEL = "USE_DEEPSEEK_MODEL"
+    DEEPSEEK_API_KEY = "DEEPSEEK_API_KEY"
+    DEEPSEEK_MODEL_NAME = "DEEPSEEK_MODEL_NAME"
+    DEEPSEEK_COST_PER_INPUT_TOKEN = "DEEPSEEK_COST_PER_INPUT_TOKEN"
+    DEEPSEEK_COST_PER_OUTPUT_TOKEN = "DEEPSEEK_COST_PER_OUTPUT_TOKEN"
+
     # Gemini
     USE_GEMINI_MODEL = "USE_GEMINI_MODEL"
-    GEMINI_MODEL_NAME = "GEMINI_MODEL_NAME"
     GOOGLE_API_KEY = "GOOGLE_API_KEY"
+    GEMINI_MODEL_NAME = "GEMINI_MODEL_NAME"
     GOOGLE_GENAI_USE_VERTEXAI = "GOOGLE_GENAI_USE_VERTEXAI"
     GOOGLE_CLOUD_PROJECT = "GOOGLE_CLOUD_PROJECT"
     GOOGLE_CLOUD_LOCATION = "GOOGLE_CLOUD_LOCATION"
     GOOGLE_SERVICE_ACCOUNT_KEY = "GOOGLE_SERVICE_ACCOUNT_KEY"
+
+    # Grok
+    USE_GROK_MODEL = "USE_GROK_MODEL"
+    GROK_API_KEY = "GROK_API_KEY"
+    GROK_MODEL_NAME = "GROK_MODEL_NAME"
+    GROK_COST_PER_INPUT_TOKEN = "GROK_COST_PER_INPUT_TOKEN"
+    GROK_COST_PER_OUTPUT_TOKEN = "GROK_COST_PER_OUTPUT_TOKEN"
+
     # LiteLLM
     USE_LITELLM = "USE_LITELLM"
-    LITELLM_MODEL_NAME = "LITELLM_MODEL_NAME"
     LITELLM_API_KEY = "LITELLM_API_KEY"
+    LITELLM_MODEL_NAME = "LITELLM_MODEL_NAME"
     LITELLM_API_BASE = "LITELLM_API_BASE"
+    LITELLM_PROXY_API_BASE = "LITELLM_PROXY_API_BASE"
+    LITELLM_PROXY_API_KEY = "LITELLM_PROXY_API_KEY"
+
+    # LM Studio
+    LM_STUDIO_API_KEY = "LM_STUDIO_API_KEY"
+    LM_STUDIO_MODEL_NAME = "LM_STUDIO_MODEL_NAME"
+
+    # Local Model
+    USE_LOCAL_MODEL = "USE_LOCAL_MODEL"
+    LOCAL_MODEL_API_KEY = "LOCAL_MODEL_API_KEY"
+    LOCAL_MODEL_NAME = "LOCAL_MODEL_NAME"
+    LOCAL_MODEL_BASE_URL = "LOCAL_MODEL_BASE_URL"
+    LOCAL_MODEL_FORMAT = "LOCAL_MODEL_FORMAT"
+
+    # Moonshot
+    USE_MOONSHOT_MODEL = "USE_MOONSHOT_MODEL"
+    MOONSHOT_API_KEY = "MOONSHOT_API_KEY"
+    MOONSHOT_MODEL_NAME = "MOONSHOT_MODEL_NAME"
+    MOONSHOT_COST_PER_INPUT_TOKEN = "MOONSHOT_COST_PER_INPUT_TOKEN"
+    MOONSHOT_COST_PER_OUTPUT_TOKEN = "MOONSHOT_COST_PER_OUTPUT_TOKEN"
+
+    # Ollama
+    OLLAMA_MODEL_NAME = "OLLAMA_MODEL_NAME"
+
     # OpenAI
     USE_OPENAI_MODEL = "USE_OPENAI_MODEL"
+    OPENAI_API_KEY = "OPENAI_API_KEY"
     OPENAI_MODEL_NAME = "OPENAI_MODEL_NAME"
     OPENAI_COST_PER_INPUT_TOKEN = "OPENAI_COST_PER_INPUT_TOKEN"
     OPENAI_COST_PER_OUTPUT_TOKEN = "OPENAI_COST_PER_OUTPUT_TOKEN"
[13 removed lines not captured in this extract]
+
+    # PortKey
+    USE_PORTKEY_MODEL = "USE_PORTKEY_MODEL"
+    PORTKEY_API_KEY = "PORTKEY_API_KEY"
+    PORTKEY_MODEL_NAME = "PORTKEY_MODEL_NAME"
+    PORTKEY_BASE_URL = "PORTKEY_BASE_URL"
+    PORTKEY_PROVIDER_NAME = "PORTKEY_PROVIDER_NAME"
+
+    # Vertex AI
+    VERTEX_AI_MODEL_NAME = "VERTEX_AI_MODEL_NAME"
+
+    # VLLM
+    VLLM_API_KEY = "VLLM_API_KEY"
+    VLLM_MODEL_NAME = "VLLM_MODEL_NAME"


 class EmbeddingKeyValues(Enum):
     # Azure OpenAI
     USE_AZURE_OPENAI_EMBEDDING = "USE_AZURE_OPENAI_EMBEDDING"
+    # Azure OpenAI
+    AZURE_EMBEDDING_MODEL_NAME = "AZURE_EMBEDDING_MODEL_NAME"
     AZURE_EMBEDDING_DEPLOYMENT_NAME = "AZURE_EMBEDDING_DEPLOYMENT_NAME"
+
     # Local
     USE_LOCAL_EMBEDDINGS = "USE_LOCAL_EMBEDDINGS"
     LOCAL_EMBEDDING_MODEL_NAME = "LOCAL_EMBEDDING_MODEL_NAME"
@@ -120,9 +190,11 @@ class KeyFileHandler:
         """Appends or updates data in the hidden file"""

         # hard stop on secrets: never write to disk
-        if key
+        if _is_secret_key(key):
             logger.warning(
[1 removed line not captured in this extract]
+                "%s is a secret setting, refusing to persist. "
+                "Keep your secrets in .env or .env.local instead.",
+                _env_key_for_legacy_enum(key),
             )
             return

@@ -167,16 +239,17 @@ class KeyFileHandler:
         # Deprecation: warn only if we're actually returning a secret
         if (
             value is not None
-            and key
-            and key
+            and _is_secret_key(key)
+            and _env_key_for_legacy_enum(key) not in _WARNED_SECRET_KEYS
         ):
             logger.warning(
[4 removed lines not captured in this extract]
+                "Reading secret '%s' from legacy %s/%s. Persisting API keys in plaintext is deprecated. "
+                "Move this to your environment (.env / .env.local). This fallback will be removed in a future release.",
+                _env_key_for_legacy_enum(key),
+                HIDDEN_DIR,
+                KEY_FILE,
             )
-            _WARNED_SECRET_KEYS.add(key
+            _WARNED_SECRET_KEYS.add(_env_key_for_legacy_enum(key))

         return value

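Note: the new _secret_env_keys helper derives the set of secret settings from the SecretStr annotations on deepeval.config.settings.Settings, which is what lets KeyFileHandler refuse to persist them. A standalone sketch of the same detection logic against a stand-in pydantic model (FakeSettings and the field names below are illustrative, not the real Settings class):

from typing import Optional, Union, get_args, get_origin

from pydantic import BaseModel, SecretStr


class FakeSettings(BaseModel):
    # Stand-in for deepeval.config.settings.Settings
    OPENAI_API_KEY: Optional[SecretStr] = None
    ANTHROPIC_API_KEY: SecretStr = SecretStr("")
    OPENAI_MODEL_NAME: Optional[str] = None


def secret_keys(model_cls: type[BaseModel]) -> frozenset[str]:
    found: set[str] = set()
    for name, field in model_cls.model_fields.items():
        ann = field.annotation
        # Direct SecretStr annotation, or SecretStr inside Optional/Union.
        if ann is SecretStr or (
            get_origin(ann) is Union
            and any(a is SecretStr for a in get_args(ann))
        ):
            found.add(name)
    return frozenset(found)


print(secret_keys(FakeSettings))
# -> frozenset({'OPENAI_API_KEY', 'ANTHROPIC_API_KEY'}) (order may vary)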
deepeval/metrics/__init__.py
CHANGED
@@ -1,7 +1,6 @@
 from .base_metric import (
     BaseMetric,
     BaseConversationalMetric,
-    BaseMultimodalMetric,
     BaseArenaMetric,
 )

@@ -65,7 +64,6 @@ from .multimodal_metrics import (
     ImageCoherenceMetric,
     ImageHelpfulnessMetric,
     ImageReferenceMetric,
-    MultimodalGEval,
 )


@@ -73,7 +71,6 @@ __all__ = [
     # Base classes
     "BaseMetric",
     "BaseConversationalMetric",
-    "BaseMultimodalMetric",
     "BaseArenaMetric",
     # Non-LLM metrics
     "ExactMatchMetric",
@@ -133,5 +130,4 @@ __all__ = [
     "ImageCoherenceMetric",
     "ImageHelpfulnessMetric",
     "ImageReferenceMetric",
-    "MultimodalGEval",
 ]

deepeval/metrics/answer_relevancy/answer_relevancy.py
CHANGED

@@ -6,17 +6,22 @@ from deepeval.utils import (
 )
 from deepeval.metrics.utils import (
     construct_verbose_logs,
-    trimAndLoadJson,
     check_llm_test_case_params,
-    check_mllm_test_case_params,
     initialize_model,
+    generate_with_schema_and_extract,
+    a_generate_with_schema_and_extract,
 )
 from deepeval.test_case import LLMTestCase, LLMTestCaseParams, MLLMImage
 from deepeval.metrics import BaseMetric
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.answer_relevancy.template import AnswerRelevancyTemplate
 from deepeval.metrics.indicator import metric_progress_indicator
-from deepeval.metrics.answer_relevancy.schema import
+from deepeval.metrics.answer_relevancy.schema import (
+    Statements,
+    AnswerRelevancyVerdict,
+    Verdicts,
+    AnswerRelevancyScoreReason,
+)
 from deepeval.metrics.api import metric_data_manager

@@ -55,13 +60,15 @@ class AnswerRelevancyMetric(BaseMetric):
         _log_metric_to_confident: bool = True,
     ) -> float:

[7 removed lines not captured in this extract]
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            test_case.multimodal,
+        )

         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -82,13 +89,13 @@ class AnswerRelevancyMetric(BaseMetric):
             actual_output = test_case.actual_output

             self.statements: List[str] = self._generate_statements(
-                actual_output, multimodal
+                actual_output, test_case.multimodal
             )
             self.verdicts: List[AnswerRelevancyVerdict] = (
-                self._generate_verdicts(input, multimodal)
+                self._generate_verdicts(input, test_case.multimodal)
             )
             self.score = self._calculate_score()
-            self.reason = self._generate_reason(input, multimodal)
+            self.reason = self._generate_reason(input, test_case.multimodal)
             self.success = self.score >= self.threshold
             self.verbose_logs = construct_verbose_logs(
                 self,
@@ -113,13 +120,15 @@ class AnswerRelevancyMetric(BaseMetric):
         _log_metric_to_confident: bool = True,
     ) -> float:

[7 removed lines not captured in this extract]
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            test_case.multimodal,
+        )

         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -132,13 +141,15 @@ class AnswerRelevancyMetric(BaseMetric):
             actual_output = test_case.actual_output

             self.statements: List[str] = await self._a_generate_statements(
-                actual_output, multimodal
+                actual_output, test_case.multimodal
             )
             self.verdicts: List[AnswerRelevancyVerdict] = (
-                await self._a_generate_verdicts(input, multimodal)
+                await self._a_generate_verdicts(input, test_case.multimodal)
             )
             self.score = self._calculate_score()
-            self.reason = await self._a_generate_reason(
+            self.reason = await self._a_generate_reason(
+                input, test_case.multimodal
+            )
             self.success = self.score >= self.threshold
             self.verbose_logs = construct_verbose_logs(
                 self,
@@ -170,22 +181,13 @@ class AnswerRelevancyMetric(BaseMetric):
             multimodal=multimodal,
         )

[7 removed lines not captured in this extract]
-        try:
-            res: AnswerRelevancyScoreReason = await self.model.a_generate(
-                prompt=prompt, schema=AnswerRelevancyScoreReason
-            )
-            return res.reason
-        except TypeError:
-            res = await self.model.a_generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return data["reason"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=AnswerRelevancyScoreReason,
+            extract_schema=lambda score_reason: score_reason.reason,
+            extract_json=lambda data: data["reason"],
+        )

     def _generate_reason(self, input: str, multimodal: bool) -> str:
         if self.include_reason is False:
@@ -203,22 +205,13 @@ class AnswerRelevancyMetric(BaseMetric):
             multimodal=multimodal,
         )

[7 removed lines not captured in this extract]
-        try:
-            res: AnswerRelevancyScoreReason = self.model.generate(
-                prompt=prompt, schema=AnswerRelevancyScoreReason
-            )
-            return res.reason
-        except TypeError:
-            res = self.model.generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return data["reason"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=AnswerRelevancyScoreReason,
+            extract_schema=lambda score_reason: score_reason.reason,
+            extract_json=lambda data: data["reason"],
+        )

     async def _a_generate_verdicts(
         self, input: str, multimodal: bool
@@ -230,22 +223,15 @@ class AnswerRelevancyMetric(BaseMetric):
             input=input, statements=self.statements, multimodal=multimodal
         )

[9 removed lines not captured in this extract]
-            return [item for item in res.verdicts]
-        except TypeError:
-            res = await self.model.a_generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return [
-                AnswerRelevancyVerdict(**item) for item in data["verdicts"]
-            ]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Verdicts,
+            extract_schema=lambda r: list(r.verdicts),
+            extract_json=lambda data: [
+                AnswerRelevancyVerdict(**item) for item in data["verdicts"]
+            ],
+        )

     def _generate_verdicts(
         self, input: str, multimodal: bool
@@ -257,22 +243,17 @@ class AnswerRelevancyMetric(BaseMetric):
             input=input, statements=self.statements, multimodal=multimodal
         )

[9 removed lines not captured in this extract]
-            res = self.model.generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return [
-                AnswerRelevancyVerdict(**item) for item in data["verdicts"]
-            ]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Verdicts,
+            extract_schema=lambda r: list(r.verdicts),
+            extract_json=lambda data: [
+                AnswerRelevancyVerdict(**item) for item in data["verdicts"]
+            ],
+        )

[1 removed line not captured in this extract]
+    def _generate_statements(
         self,
         actual_output: str,
         multimodal: bool,
@@ -280,31 +261,18 @@ class AnswerRelevancyMetric(BaseMetric):
         prompt = self.evaluation_template.generate_statements(
             actual_output=actual_output, multimodal=multimodal
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=Statements)
-            self.evaluation_cost += cost
-            statements: List[str] = res.statements + [
-                ele for ele in actual_output if isinstance(ele, MLLMImage)
-            ]
-            return statements
-        else:
-            try:
-                res: Statements = await self.model.a_generate(
-                    prompt, schema=Statements
-                )
-                statements: List[str] = res.statements + [
-                    ele for ele in actual_output if isinstance(ele, MLLMImage)
-                ]
-                return statements
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                statements = data["statements"] + [
-                    ele for ele in actual_output if isinstance(ele, MLLMImage)
-                ]
-                return statements

[1 removed line not captured in this extract]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Statements,
+            extract_schema=lambda s: s.statements
+            + [ele for ele in actual_output if isinstance(ele, MLLMImage)],
+            extract_json=lambda d: d["statements"]
+            + [ele for ele in actual_output if isinstance(ele, MLLMImage)],
+        )
+
+    async def _a_generate_statements(
         self,
         actual_output: str,
         multimodal: bool,
@@ -312,27 +280,16 @@ class AnswerRelevancyMetric(BaseMetric):
         prompt = self.evaluation_template.generate_statements(
             actual_output=actual_output, multimodal=multimodal
         )
[10 removed lines not captured in this extract]
-            statements = res.statements + [
-                ele for ele in actual_output if isinstance(ele, MLLMImage)
-            ]
-            return statements
-        except TypeError:
-            res = self.model.generate(prompt)
-            data = trimAndLoadJson(res, self)
-            statements = data["statements"] + [
-                ele for ele in actual_output if isinstance(ele, MLLMImage)
-            ]
-            return statements
+
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Statements,
+            extract_schema=lambda s: s.statements
+            + [ele for ele in actual_output if isinstance(ele, MLLMImage)],
+            extract_json=lambda d: d["statements"]
+            + [ele for ele in actual_output if isinstance(ele, MLLMImage)],
+        )

     def _calculate_score(self):
         number_of_verdicts = len(self.verdicts)
@@ -353,7 +310,7 @@ class AnswerRelevancyMetric(BaseMetric):
         else:
             try:
                 self.success = self.score >= self.threshold
-            except:
+            except TypeError:
                 self.success = False
         return self.success

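Note: across the metric files touched in this release, the repeated "try schema-based generate, except TypeError fall back to trimAndLoadJson" blocks are replaced by the shared generate_with_schema_and_extract / a_generate_with_schema_and_extract helpers from deepeval.metrics.utils. Their implementation is not part of this extract; based on the call sites above and the code they replace, the sync variant presumably behaves roughly like the sketch below (the branch order, cost handling, and the continued availability of trimAndLoadJson are assumptions):

from typing import Any, Callable, Dict, Type

from pydantic import BaseModel


def generate_with_schema_and_extract_sketch(
    metric: Any,
    prompt: str,
    schema_cls: Type[BaseModel],
    extract_schema: Callable[[Any], Any],
    extract_json: Callable[[Dict[str, Any]], Any],
) -> Any:
    # Sketch only: the real helper lives in deepeval/metrics/utils.py.
    from deepeval.metrics.utils import trimAndLoadJson  # existing helper in 3.7.5; assumed still present

    if metric.using_native_model:
        # Native models return (parsed_schema, cost); cost tracking stays centralized.
        res, cost = metric.model.generate(prompt, schema=schema_cls)
        metric.evaluation_cost += cost
        return extract_schema(res)
    try:
        # Custom models that accept a schema return the parsed Pydantic object directly.
        res = metric.model.generate(prompt, schema=schema_cls)
        return extract_schema(res)
    except TypeError:
        # Models without schema support: fall back to raw text plus JSON extraction.
        res = metric.model.generate(prompt)
        return extract_json(trimAndLoadJson(res, metric))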