deepeval 3.7.5__py3-none-any.whl → 3.7.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150)
  1. deepeval/_version.py +1 -1
  2. deepeval/cli/main.py +2022 -759
  3. deepeval/cli/utils.py +208 -36
  4. deepeval/config/dotenv_handler.py +19 -0
  5. deepeval/config/settings.py +675 -245
  6. deepeval/config/utils.py +9 -1
  7. deepeval/dataset/api.py +23 -1
  8. deepeval/dataset/golden.py +106 -21
  9. deepeval/evaluate/evaluate.py +0 -3
  10. deepeval/evaluate/execute.py +162 -315
  11. deepeval/evaluate/utils.py +6 -30
  12. deepeval/key_handler.py +124 -51
  13. deepeval/metrics/__init__.py +0 -4
  14. deepeval/metrics/answer_relevancy/answer_relevancy.py +89 -132
  15. deepeval/metrics/answer_relevancy/template.py +102 -179
  16. deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
  17. deepeval/metrics/arena_g_eval/template.py +17 -1
  18. deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
  19. deepeval/metrics/argument_correctness/template.py +19 -2
  20. deepeval/metrics/base_metric.py +19 -41
  21. deepeval/metrics/bias/bias.py +102 -108
  22. deepeval/metrics/bias/template.py +14 -2
  23. deepeval/metrics/contextual_precision/contextual_precision.py +56 -92
  24. deepeval/metrics/contextual_recall/contextual_recall.py +58 -85
  25. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +53 -83
  26. deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
  27. deepeval/metrics/conversation_completeness/template.py +23 -3
  28. deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
  29. deepeval/metrics/conversational_dag/nodes.py +66 -123
  30. deepeval/metrics/conversational_dag/templates.py +16 -0
  31. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
  32. deepeval/metrics/dag/dag.py +10 -0
  33. deepeval/metrics/dag/nodes.py +63 -126
  34. deepeval/metrics/dag/templates.py +14 -0
  35. deepeval/metrics/exact_match/exact_match.py +9 -1
  36. deepeval/metrics/faithfulness/faithfulness.py +82 -136
  37. deepeval/metrics/g_eval/g_eval.py +93 -79
  38. deepeval/metrics/g_eval/template.py +18 -1
  39. deepeval/metrics/g_eval/utils.py +7 -6
  40. deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
  41. deepeval/metrics/goal_accuracy/template.py +21 -3
  42. deepeval/metrics/hallucination/hallucination.py +60 -75
  43. deepeval/metrics/hallucination/template.py +13 -0
  44. deepeval/metrics/indicator.py +11 -10
  45. deepeval/metrics/json_correctness/json_correctness.py +40 -38
  46. deepeval/metrics/json_correctness/template.py +10 -0
  47. deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
  48. deepeval/metrics/knowledge_retention/schema.py +9 -3
  49. deepeval/metrics/knowledge_retention/template.py +12 -0
  50. deepeval/metrics/mcp/mcp_task_completion.py +72 -43
  51. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +93 -75
  52. deepeval/metrics/mcp/schema.py +4 -0
  53. deepeval/metrics/mcp/template.py +59 -0
  54. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
  55. deepeval/metrics/mcp_use_metric/template.py +12 -0
  56. deepeval/metrics/misuse/misuse.py +77 -97
  57. deepeval/metrics/misuse/template.py +15 -0
  58. deepeval/metrics/multimodal_metrics/__init__.py +0 -1
  59. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +37 -38
  60. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +55 -76
  61. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +37 -38
  62. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +37 -38
  63. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +57 -76
  64. deepeval/metrics/non_advice/non_advice.py +79 -105
  65. deepeval/metrics/non_advice/template.py +12 -0
  66. deepeval/metrics/pattern_match/pattern_match.py +12 -4
  67. deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
  68. deepeval/metrics/pii_leakage/template.py +14 -0
  69. deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
  70. deepeval/metrics/plan_adherence/template.py +11 -0
  71. deepeval/metrics/plan_quality/plan_quality.py +63 -87
  72. deepeval/metrics/plan_quality/template.py +9 -0
  73. deepeval/metrics/prompt_alignment/prompt_alignment.py +78 -86
  74. deepeval/metrics/prompt_alignment/template.py +12 -0
  75. deepeval/metrics/role_adherence/role_adherence.py +48 -71
  76. deepeval/metrics/role_adherence/template.py +14 -0
  77. deepeval/metrics/role_violation/role_violation.py +75 -108
  78. deepeval/metrics/role_violation/template.py +12 -0
  79. deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
  80. deepeval/metrics/step_efficiency/template.py +11 -0
  81. deepeval/metrics/summarization/summarization.py +115 -183
  82. deepeval/metrics/summarization/template.py +19 -0
  83. deepeval/metrics/task_completion/task_completion.py +67 -73
  84. deepeval/metrics/tool_correctness/tool_correctness.py +43 -42
  85. deepeval/metrics/tool_use/schema.py +4 -0
  86. deepeval/metrics/tool_use/template.py +16 -2
  87. deepeval/metrics/tool_use/tool_use.py +72 -94
  88. deepeval/metrics/topic_adherence/schema.py +4 -0
  89. deepeval/metrics/topic_adherence/template.py +21 -1
  90. deepeval/metrics/topic_adherence/topic_adherence.py +68 -81
  91. deepeval/metrics/toxicity/template.py +13 -0
  92. deepeval/metrics/toxicity/toxicity.py +80 -99
  93. deepeval/metrics/turn_contextual_precision/schema.py +3 -3
  94. deepeval/metrics/turn_contextual_precision/template.py +9 -2
  95. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +154 -154
  96. deepeval/metrics/turn_contextual_recall/schema.py +3 -3
  97. deepeval/metrics/turn_contextual_recall/template.py +8 -1
  98. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +148 -143
  99. deepeval/metrics/turn_contextual_relevancy/schema.py +2 -2
  100. deepeval/metrics/turn_contextual_relevancy/template.py +8 -1
  101. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +154 -157
  102. deepeval/metrics/turn_faithfulness/schema.py +1 -1
  103. deepeval/metrics/turn_faithfulness/template.py +8 -1
  104. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +180 -203
  105. deepeval/metrics/turn_relevancy/template.py +14 -0
  106. deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
  107. deepeval/metrics/utils.py +161 -91
  108. deepeval/models/__init__.py +2 -0
  109. deepeval/models/base_model.py +44 -6
  110. deepeval/models/embedding_models/azure_embedding_model.py +34 -12
  111. deepeval/models/embedding_models/local_embedding_model.py +22 -7
  112. deepeval/models/embedding_models/ollama_embedding_model.py +17 -6
  113. deepeval/models/embedding_models/openai_embedding_model.py +3 -2
  114. deepeval/models/llms/__init__.py +2 -0
  115. deepeval/models/llms/amazon_bedrock_model.py +229 -73
  116. deepeval/models/llms/anthropic_model.py +143 -48
  117. deepeval/models/llms/azure_model.py +169 -95
  118. deepeval/models/llms/constants.py +2032 -0
  119. deepeval/models/llms/deepseek_model.py +82 -35
  120. deepeval/models/llms/gemini_model.py +126 -67
  121. deepeval/models/llms/grok_model.py +128 -65
  122. deepeval/models/llms/kimi_model.py +129 -87
  123. deepeval/models/llms/litellm_model.py +94 -18
  124. deepeval/models/llms/local_model.py +115 -16
  125. deepeval/models/llms/ollama_model.py +97 -76
  126. deepeval/models/llms/openai_model.py +169 -311
  127. deepeval/models/llms/portkey_model.py +58 -16
  128. deepeval/models/llms/utils.py +5 -2
  129. deepeval/models/retry_policy.py +10 -5
  130. deepeval/models/utils.py +56 -4
  131. deepeval/simulator/conversation_simulator.py +49 -2
  132. deepeval/simulator/template.py +16 -1
  133. deepeval/synthesizer/synthesizer.py +19 -17
  134. deepeval/test_case/api.py +24 -45
  135. deepeval/test_case/arena_test_case.py +7 -2
  136. deepeval/test_case/conversational_test_case.py +55 -6
  137. deepeval/test_case/llm_test_case.py +60 -6
  138. deepeval/test_run/api.py +3 -0
  139. deepeval/test_run/test_run.py +6 -1
  140. deepeval/utils.py +26 -0
  141. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/METADATA +3 -3
  142. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/RECORD +145 -148
  143. deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
  144. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
  145. deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
  146. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -133
  147. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
  148. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/LICENSE.md +0 -0
  149. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/WHEEL +0 -0
  150. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/entry_points.txt +0 -0
deepeval/evaluate/utils.py CHANGED
@@ -11,7 +11,6 @@ from deepeval.metrics import (
     ArenaGEval,
     BaseMetric,
     BaseConversationalMetric,
-    BaseMultimodalMetric,
 )
 from deepeval.test_case import (
     LLMTestCase,
@@ -218,9 +217,9 @@ def validate_assert_test_inputs(
         )

     if test_case and metrics:
-        if (
-            isinstance(test_case, LLMTestCase) and not test_case.multimodal
-        ) and not all(isinstance(metric, BaseMetric) for metric in metrics):
+        if (isinstance(test_case, LLMTestCase)) and not all(
+            isinstance(metric, BaseMetric) for metric in metrics
+        ):
             raise ValueError(
                 "All 'metrics' for an 'LLMTestCase' must be instances of 'BaseMetric' only."
             )
@@ -230,18 +229,6 @@ def validate_assert_test_inputs(
             raise ValueError(
                 "All 'metrics' for an 'ConversationalTestCase' must be instances of 'BaseConversationalMetric' only."
             )
-        if (
-            isinstance(test_case, LLMTestCase) and test_case.multimodal
-        ) and not all(
-            (
-                isinstance(metric, BaseMultimodalMetric)
-                or isinstance(metric, BaseMetric)
-            )
-            for metric in metrics
-        ):
-            raise ValueError(
-                "All 'metrics' for multi-modal LLMTestCase must be instances of 'BaseMultimodalMetric' only."
-            )

     if not ((golden and observed_callback) or (test_case and metrics)):
         raise ValueError(
@@ -259,7 +246,6 @@ def validate_evaluate_inputs(
         Union[
             List[BaseMetric],
             List[BaseConversationalMetric],
-            List[BaseMultimodalMetric],
         ]
     ] = None,
     metric_collection: Optional[str] = None,
@@ -292,10 +278,9 @@ def validate_evaluate_inputs(
     if test_cases and metrics:
         for test_case in test_cases:
             for metric in metrics:
-                if (
-                    isinstance(test_case, LLMTestCase)
-                    and not test_case.multimodal
-                ) and not isinstance(metric, BaseMetric):
+                if (isinstance(test_case, LLMTestCase)) and not isinstance(
+                    metric, BaseMetric
+                ):
                     raise ValueError(
                         f"Metric {metric.__name__} is not a valid metric for LLMTestCase."
                     )
@@ -306,15 +291,6 @@ def validate_evaluate_inputs(
                     raise ValueError(
                         f"Metric {metric.__name__} is not a valid metric for ConversationalTestCase."
                     )
-                if (
-                    isinstance(test_case, LLMTestCase) and test_case.multimodal
-                ) and not (
-                    isinstance(metric, BaseMultimodalMetric)
-                    or isinstance(metric, BaseMetric)
-                ):
-                    raise ValueError(
-                        f"Metric {metric.__name__} is not a valid metric for multi-modal LLMTestCase."
-                    )


 def print_test_result(test_result: TestResult, display: TestRunResultDisplay):
deepeval/key_handler.py CHANGED
@@ -5,7 +5,9 @@ import json
 import logging

 from enum import Enum
-from typing import Union
+from functools import lru_cache
+from pydantic import SecretStr
+from typing import get_args, get_origin, Union

 from .constants import KEY_FILE, HIDDEN_DIR

@@ -13,26 +15,34 @@ from .constants import KEY_FILE, HIDDEN_DIR
 logger = logging.getLogger(__name__)


-SECRET_KEYS = {
-    # General providers
-    "OPENAI_API_KEY",
-    "ANTHROPIC_API_KEY",
-    # Azure OpenAI
-    "AZURE_OPENAI_API_KEY",
-    # Google / Gemini
-    "GOOGLE_API_KEY",
-    # xAI Grok
-    "GROK_API_KEY",
-    # Moonshot
-    "MOONSHOT_API_KEY",
-    # DeepSeek
-    "DEEPSEEK_API_KEY",
-    # LiteLLM
-    "LITELLM_API_KEY",
-    # Local gateways (if any require keys)
-    "LOCAL_MODEL_API_KEY",
-    "LOCAL_EMBEDDING_API_KEY",
-}
+@lru_cache(maxsize=1)
+def _secret_env_keys() -> frozenset[str]:
+    # Lazy import avoids cycles at import time
+    from deepeval.config.settings import Settings
+
+    secret_keys: set[str] = set()
+    for env_key, field in Settings.model_fields.items():
+        ann = field.annotation
+        if ann is SecretStr:
+            secret_keys.add(env_key)
+            continue
+
+        origin = get_origin(ann)
+        if origin is Union and any(a is SecretStr for a in get_args(ann)):
+            secret_keys.add(env_key)
+
+    return frozenset(secret_keys)
+
+
+def _env_key_for_legacy_enum(key) -> str:
+    # For ModelKeyValues, .name == .value, for KeyValues it's the important one:
+    # KeyValues.API_KEY.name == "API_KEY" (matches Settings), value == "api_key" (legacy json key)
+    return getattr(key, "name", str(key))
+
+
+def _is_secret_key(key) -> bool:
+    return _env_key_for_legacy_enum(key) in _secret_env_keys()
+

 _WARNED_SECRET_KEYS = set()

@@ -40,7 +50,10 @@ _WARNED_SECRET_KEYS = set()
 class KeyValues(Enum):
     # Confident AI
     API_KEY = "api_key"
+    CONFIDENT_API_KEY = "confident_api_key"
+    CONFIDENT_BASE_URL = "confident_base_url"
     CONFIDENT_REGION = "confident_region"
+
     # Cache
     LAST_TEST_RUN_LINK = "last_test_run_link"
     LAST_TEST_RUN_DATA = "last_test_run_data"
@@ -49,6 +62,24 @@ class KeyValues(Enum):
 class ModelKeyValues(Enum):
     # General
     TEMPERATURE = "TEMPERATURE"
+
+    # Anthropic
+    USE_ANTHROPIC_MODEL = "USE_ANTHROPIC_MODEL"
+    ANTHROPIC_API_KEY = "ANTHROPIC_API_KEY"
+    ANTHROPIC_MODEL_NAME = "ANTHROPIC_MODEL_NAME"
+    ANTHROPIC_COST_PER_INPUT_TOKEN = "ANTHROPIC_COST_PER_INPUT_TOKEN"
+    ANTHROPIC_COST_PER_OUTPUT_TOKEN = "ANTHROPIC_COST_PER_OUTPUT_TOKEN"
+
+    # AWS
+    AWS_ACCESS_KEY_ID = "AWS_ACCESS_KEY_ID"
+    AWS_SECRET_ACCESS_KEY = "AWS_SECRET_ACCESS_KEY"
+    # AWS Bedrock
+    USE_AWS_BEDROCK_MODEL = "USE_AWS_BEDROCK_MODEL"
+    AWS_BEDROCK_MODEL_NAME = "AWS_BEDROCK_MODEL_NAME"
+    AWS_BEDROCK_REGION = "AWS_BEDROCK_REGION"
+    AWS_BEDROCK_COST_PER_INPUT_TOKEN = "AWS_BEDROCK_COST_PER_INPUT_TOKEN"
+    AWS_BEDROCK_COST_PER_OUTPUT_TOKEN = "AWS_BEDROCK_COST_PER_OUTPUT_TOKEN"
+
     # Azure Open AI
     AZURE_OPENAI_API_KEY = "AZURE_OPENAI_API_KEY"
     AZURE_OPENAI_ENDPOINT = "AZURE_OPENAI_ENDPOINT"
@@ -57,49 +88,88 @@ class ModelKeyValues(Enum):
     AZURE_MODEL_NAME = "AZURE_MODEL_NAME"
     AZURE_MODEL_VERSION = "AZURE_MODEL_VERSION"
     USE_AZURE_OPENAI = "USE_AZURE_OPENAI"
-    # Local Model
-    LOCAL_MODEL_NAME = "LOCAL_MODEL_NAME"
-    LOCAL_MODEL_BASE_URL = "LOCAL_MODEL_BASE_URL"
-    LOCAL_MODEL_API_KEY = "LOCAL_MODEL_API_KEY"
-    LOCAL_MODEL_FORMAT = "LOCAL_MODEL_FORMAT"
-    USE_LOCAL_MODEL = "USE_LOCAL_MODEL"
+
+    # DeepSeek
+    USE_DEEPSEEK_MODEL = "USE_DEEPSEEK_MODEL"
+    DEEPSEEK_API_KEY = "DEEPSEEK_API_KEY"
+    DEEPSEEK_MODEL_NAME = "DEEPSEEK_MODEL_NAME"
+    DEEPSEEK_COST_PER_INPUT_TOKEN = "DEEPSEEK_COST_PER_INPUT_TOKEN"
+    DEEPSEEK_COST_PER_OUTPUT_TOKEN = "DEEPSEEK_COST_PER_OUTPUT_TOKEN"
+
     # Gemini
     USE_GEMINI_MODEL = "USE_GEMINI_MODEL"
-    GEMINI_MODEL_NAME = "GEMINI_MODEL_NAME"
     GOOGLE_API_KEY = "GOOGLE_API_KEY"
+    GEMINI_MODEL_NAME = "GEMINI_MODEL_NAME"
     GOOGLE_GENAI_USE_VERTEXAI = "GOOGLE_GENAI_USE_VERTEXAI"
     GOOGLE_CLOUD_PROJECT = "GOOGLE_CLOUD_PROJECT"
     GOOGLE_CLOUD_LOCATION = "GOOGLE_CLOUD_LOCATION"
     GOOGLE_SERVICE_ACCOUNT_KEY = "GOOGLE_SERVICE_ACCOUNT_KEY"
+
+    # Grok
+    USE_GROK_MODEL = "USE_GROK_MODEL"
+    GROK_API_KEY = "GROK_API_KEY"
+    GROK_MODEL_NAME = "GROK_MODEL_NAME"
+    GROK_COST_PER_INPUT_TOKEN = "GROK_COST_PER_INPUT_TOKEN"
+    GROK_COST_PER_OUTPUT_TOKEN = "GROK_COST_PER_OUTPUT_TOKEN"
+
     # LiteLLM
     USE_LITELLM = "USE_LITELLM"
-    LITELLM_MODEL_NAME = "LITELLM_MODEL_NAME"
     LITELLM_API_KEY = "LITELLM_API_KEY"
+    LITELLM_MODEL_NAME = "LITELLM_MODEL_NAME"
     LITELLM_API_BASE = "LITELLM_API_BASE"
+    LITELLM_PROXY_API_BASE = "LITELLM_PROXY_API_BASE"
+    LITELLM_PROXY_API_KEY = "LITELLM_PROXY_API_KEY"
+
+    # LM Studio
+    LM_STUDIO_API_KEY = "LM_STUDIO_API_KEY"
+    LM_STUDIO_MODEL_NAME = "LM_STUDIO_MODEL_NAME"
+
+    # Local Model
+    USE_LOCAL_MODEL = "USE_LOCAL_MODEL"
+    LOCAL_MODEL_API_KEY = "LOCAL_MODEL_API_KEY"
+    LOCAL_MODEL_NAME = "LOCAL_MODEL_NAME"
+    LOCAL_MODEL_BASE_URL = "LOCAL_MODEL_BASE_URL"
+    LOCAL_MODEL_FORMAT = "LOCAL_MODEL_FORMAT"
+
+    # Moonshot
+    USE_MOONSHOT_MODEL = "USE_MOONSHOT_MODEL"
+    MOONSHOT_API_KEY = "MOONSHOT_API_KEY"
+    MOONSHOT_MODEL_NAME = "MOONSHOT_MODEL_NAME"
+    MOONSHOT_COST_PER_INPUT_TOKEN = "MOONSHOT_COST_PER_INPUT_TOKEN"
+    MOONSHOT_COST_PER_OUTPUT_TOKEN = "MOONSHOT_COST_PER_OUTPUT_TOKEN"
+
+    # Ollama
+    OLLAMA_MODEL_NAME = "OLLAMA_MODEL_NAME"
+
     # OpenAI
     USE_OPENAI_MODEL = "USE_OPENAI_MODEL"
+    OPENAI_API_KEY = "OPENAI_API_KEY"
     OPENAI_MODEL_NAME = "OPENAI_MODEL_NAME"
     OPENAI_COST_PER_INPUT_TOKEN = "OPENAI_COST_PER_INPUT_TOKEN"
     OPENAI_COST_PER_OUTPUT_TOKEN = "OPENAI_COST_PER_OUTPUT_TOKEN"
-    OPENAI_API_KEY = "OPENAI_API_KEY"
-    # Moonshot
-    USE_MOONSHOT_MODEL = "USE_MOONSHOT_MODEL"
-    MOONSHOT_MODEL_NAME = "MOONSHOT_MODEL_NAME"
-    MOONSHOT_API_KEY = "MOONSHOT_API_KEY"
-    # Grok
-    USE_GROK_MODEL = "USE_GROK_MODEL"
-    GROK_MODEL_NAME = "GROK_MODEL_NAME"
-    GROK_API_KEY = "GROK_API_KEY"
-    # DeepSeek
-    USE_DEEPSEEK_MODEL = "USE_DEEPSEEK_MODEL"
-    DEEPSEEK_MODEL_NAME = "DEEPSEEK_MODEL_NAME"
-    DEEPSEEK_API_KEY = "DEEPSEEK_API_KEY"
+
+    # PortKey
+    USE_PORTKEY_MODEL = "USE_PORTKEY_MODEL"
+    PORTKEY_API_KEY = "PORTKEY_API_KEY"
+    PORTKEY_MODEL_NAME = "PORTKEY_MODEL_NAME"
+    PORTKEY_BASE_URL = "PORTKEY_BASE_URL"
+    PORTKEY_PROVIDER_NAME = "PORTKEY_PROVIDER_NAME"
+
+    # Vertex AI
+    VERTEX_AI_MODEL_NAME = "VERTEX_AI_MODEL_NAME"
+
+    # VLLM
+    VLLM_API_KEY = "VLLM_API_KEY"
+    VLLM_MODEL_NAME = "VLLM_MODEL_NAME"


 class EmbeddingKeyValues(Enum):
     # Azure OpenAI
     USE_AZURE_OPENAI_EMBEDDING = "USE_AZURE_OPENAI_EMBEDDING"
+    # Azure OpenAI
+    AZURE_EMBEDDING_MODEL_NAME = "AZURE_EMBEDDING_MODEL_NAME"
     AZURE_EMBEDDING_DEPLOYMENT_NAME = "AZURE_EMBEDDING_DEPLOYMENT_NAME"
+
     # Local
     USE_LOCAL_EMBEDDINGS = "USE_LOCAL_EMBEDDINGS"
     LOCAL_EMBEDDING_MODEL_NAME = "LOCAL_EMBEDDING_MODEL_NAME"
@@ -120,9 +190,11 @@ class KeyFileHandler:
         """Appends or updates data in the hidden file"""

         # hard stop on secrets: never write to disk
-        if key.value in SECRET_KEYS:
+        if _is_secret_key(key):
             logger.warning(
-                f"{key} is blacklisted, refusing to persist. Keep your secrets in .env or .env.local instead"
+                "%s is a secret setting, refusing to persist. "
+                "Keep your secrets in .env or .env.local instead.",
+                _env_key_for_legacy_enum(key),
             )
             return

@@ -167,16 +239,17 @@ class KeyFileHandler:
         # Deprecation: warn only if we're actually returning a secret
         if (
             value is not None
-            and key.value in SECRET_KEYS
-            and key.value not in _WARNED_SECRET_KEYS
+            and _is_secret_key(key)
+            and _env_key_for_legacy_enum(key) not in _WARNED_SECRET_KEYS
         ):
             logger.warning(
-                f"Reading secret '{key.value}' from legacy {HIDDEN_DIR}/{KEY_FILE}. "
-                "Persisting API keys in plaintext is deprecated. "
-                "Move this to your environment (.env / .env.local). "
-                "This fallback will be removed in a future release."
+                "Reading secret '%s' from legacy %s/%s. Persisting API keys in plaintext is deprecated. "
+                "Move this to your environment (.env / .env.local). This fallback will be removed in a future release.",
+                _env_key_for_legacy_enum(key),
+                HIDDEN_DIR,
+                KEY_FILE,
             )
-            _WARNED_SECRET_KEYS.add(key.value)
+            _WARNED_SECRET_KEYS.add(_env_key_for_legacy_enum(key))

         return value

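The hunks above replace the hardcoded SECRET_KEYS set with runtime introspection of the pydantic Settings model: any field annotated as SecretStr, directly or inside a Union/Optional, is treated as a secret and is never persisted to the legacy JSON key file. The standalone sketch below illustrates that detection pattern outside deepeval; the ExampleSettings class and its field names are hypothetical stand-ins, not deepeval's actual settings schema.

from typing import Optional, Union, get_args, get_origin

from pydantic import BaseModel, SecretStr


class ExampleSettings(BaseModel):
    # Hypothetical fields for illustration only
    OPENAI_API_KEY: Optional[SecretStr] = None  # Optional[SecretStr] == Union[SecretStr, None]
    OPENAI_MODEL_NAME: Optional[str] = None
    TEMPERATURE: float = 0.0


def secret_field_names(model_cls: type[BaseModel]) -> frozenset[str]:
    """Collect field names whose annotation is SecretStr or a Union containing it."""
    secrets: set[str] = set()
    for name, field in model_cls.model_fields.items():
        ann = field.annotation
        # Direct SecretStr annotation
        if ann is SecretStr:
            secrets.add(name)
            continue
        # Optional[SecretStr] / Union[..., SecretStr]
        if get_origin(ann) is Union and any(a is SecretStr for a in get_args(ann)):
            secrets.add(name)
    return frozenset(secrets)


print(secret_field_names(ExampleSettings))  # frozenset({'OPENAI_API_KEY'})

Because the check keys off annotations rather than a maintained list, newly added provider keys such as the PORTKEY_API_KEY and VLLM_API_KEY entries above are covered automatically, provided they are declared as SecretStr fields on Settings.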
deepeval/metrics/__init__.py CHANGED
@@ -1,7 +1,6 @@
 from .base_metric import (
     BaseMetric,
     BaseConversationalMetric,
-    BaseMultimodalMetric,
     BaseArenaMetric,
 )

@@ -65,7 +64,6 @@ from .multimodal_metrics import (
     ImageCoherenceMetric,
     ImageHelpfulnessMetric,
     ImageReferenceMetric,
-    MultimodalGEval,
 )


@@ -73,7 +71,6 @@ __all__ = [
     # Base classes
     "BaseMetric",
     "BaseConversationalMetric",
-    "BaseMultimodalMetric",
     "BaseArenaMetric",
     # Non-LLM metrics
     "ExactMatchMetric",
@@ -133,5 +130,4 @@ __all__ = [
     "ImageCoherenceMetric",
     "ImageHelpfulnessMetric",
     "ImageReferenceMetric",
-    "MultimodalGEval",
 ]
deepeval/metrics/answer_relevancy/answer_relevancy.py CHANGED
@@ -6,17 +6,22 @@ from deepeval.utils import (
 )
 from deepeval.metrics.utils import (
     construct_verbose_logs,
-    trimAndLoadJson,
     check_llm_test_case_params,
-    check_mllm_test_case_params,
     initialize_model,
+    generate_with_schema_and_extract,
+    a_generate_with_schema_and_extract,
 )
 from deepeval.test_case import LLMTestCase, LLMTestCaseParams, MLLMImage
 from deepeval.metrics import BaseMetric
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.answer_relevancy.template import AnswerRelevancyTemplate
 from deepeval.metrics.indicator import metric_progress_indicator
-from deepeval.metrics.answer_relevancy.schema import *
+from deepeval.metrics.answer_relevancy.schema import (
+    Statements,
+    AnswerRelevancyVerdict,
+    Verdicts,
+    AnswerRelevancyScoreReason,
+)
 from deepeval.metrics.api import metric_data_manager


@@ -55,13 +60,15 @@ class AnswerRelevancyMetric(BaseMetric):
         _log_metric_to_confident: bool = True,
     ) -> float:

-        multimodal = test_case.multimodal
-        if multimodal:
-            check_mllm_test_case_params(
-                test_case, self._required_params, None, None, self, self.model
-            )
-        else:
-            check_llm_test_case_params(test_case, self._required_params, self)
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            test_case.multimodal,
+        )

         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -82,13 +89,13 @@ class AnswerRelevancyMetric(BaseMetric):
             actual_output = test_case.actual_output

             self.statements: List[str] = self._generate_statements(
-                actual_output, multimodal
+                actual_output, test_case.multimodal
             )
             self.verdicts: List[AnswerRelevancyVerdict] = (
-                self._generate_verdicts(input, multimodal)
+                self._generate_verdicts(input, test_case.multimodal)
             )
             self.score = self._calculate_score()
-            self.reason = self._generate_reason(input, multimodal)
+            self.reason = self._generate_reason(input, test_case.multimodal)
             self.success = self.score >= self.threshold
             self.verbose_logs = construct_verbose_logs(
                 self,
@@ -113,13 +120,15 @@ class AnswerRelevancyMetric(BaseMetric):
         _log_metric_to_confident: bool = True,
     ) -> float:

-        multimodal = test_case.multimodal
-        if multimodal:
-            check_mllm_test_case_params(
-                test_case, self._required_params, None, None, self, self.model
-            )
-        else:
-            check_llm_test_case_params(test_case, self._required_params, self)
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            test_case.multimodal,
+        )

         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -132,13 +141,15 @@ class AnswerRelevancyMetric(BaseMetric):
             actual_output = test_case.actual_output

             self.statements: List[str] = await self._a_generate_statements(
-                actual_output, multimodal
+                actual_output, test_case.multimodal
             )
             self.verdicts: List[AnswerRelevancyVerdict] = (
-                await self._a_generate_verdicts(input, multimodal)
+                await self._a_generate_verdicts(input, test_case.multimodal)
             )
             self.score = self._calculate_score()
-            self.reason = await self._a_generate_reason(input, multimodal)
+            self.reason = await self._a_generate_reason(
+                input, test_case.multimodal
+            )
             self.success = self.score >= self.threshold
             self.verbose_logs = construct_verbose_logs(
                 self,
@@ -170,22 +181,13 @@ class AnswerRelevancyMetric(BaseMetric):
             multimodal=multimodal,
         )

-        if self.using_native_model:
-            res, cost = await self.model.a_generate(
-                prompt, schema=AnswerRelevancyScoreReason
-            )
-            self.evaluation_cost += cost
-            return res.reason
-        else:
-            try:
-                res: AnswerRelevancyScoreReason = await self.model.a_generate(
-                    prompt=prompt, schema=AnswerRelevancyScoreReason
-                )
-                return res.reason
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=AnswerRelevancyScoreReason,
+            extract_schema=lambda score_reason: score_reason.reason,
+            extract_json=lambda data: data["reason"],
+        )

     def _generate_reason(self, input: str, multimodal: bool) -> str:
         if self.include_reason is False:
@@ -203,22 +205,13 @@ class AnswerRelevancyMetric(BaseMetric):
             multimodal=multimodal,
         )

-        if self.using_native_model:
-            res, cost = self.model.generate(
-                prompt, schema=AnswerRelevancyScoreReason
-            )
-            self.evaluation_cost += cost
-            return res.reason
-        else:
-            try:
-                res: AnswerRelevancyScoreReason = self.model.generate(
-                    prompt=prompt, schema=AnswerRelevancyScoreReason
-                )
-                return res.reason
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=AnswerRelevancyScoreReason,
+            extract_schema=lambda score_reason: score_reason.reason,
+            extract_json=lambda data: data["reason"],
+        )

     async def _a_generate_verdicts(
         self, input: str, multimodal: bool
@@ -230,22 +223,15 @@ class AnswerRelevancyMetric(BaseMetric):
             input=input, statements=self.statements, multimodal=multimodal
         )

-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=Verdicts)
-            self.evaluation_cost += cost
-            return [item for item in res.verdicts]
-        else:
-            try:
-                res: Verdicts = await self.model.a_generate(
-                    prompt, schema=Verdicts
-                )
-                return [item for item in res.verdicts]
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return [
-                    AnswerRelevancyVerdict(**item) for item in data["verdicts"]
-                ]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Verdicts,
+            extract_schema=lambda r: list(r.verdicts),
+            extract_json=lambda data: [
+                AnswerRelevancyVerdict(**item) for item in data["verdicts"]
+            ],
+        )

     def _generate_verdicts(
         self, input: str, multimodal: bool
@@ -257,22 +243,17 @@ class AnswerRelevancyMetric(BaseMetric):
             input=input, statements=self.statements, multimodal=multimodal
         )

-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=Verdicts)
-            self.evaluation_cost += cost
-            return [item for item in res.verdicts]
-        else:
-            try:
-                res: Verdicts = self.model.generate(prompt, schema=Verdicts)
-                return [item for item in res.verdicts]
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return [
-                    AnswerRelevancyVerdict(**item) for item in data["verdicts"]
-                ]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Verdicts,
+            extract_schema=lambda r: list(r.verdicts),
+            extract_json=lambda data: [
+                AnswerRelevancyVerdict(**item) for item in data["verdicts"]
+            ],
+        )

-    async def _a_generate_statements(
+    def _generate_statements(
         self,
         actual_output: str,
         multimodal: bool,
@@ -280,31 +261,18 @@ class AnswerRelevancyMetric(BaseMetric):
         prompt = self.evaluation_template.generate_statements(
             actual_output=actual_output, multimodal=multimodal
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=Statements)
-            self.evaluation_cost += cost
-            statements: List[str] = res.statements + [
-                ele for ele in actual_output if isinstance(ele, MLLMImage)
-            ]
-            return statements
-        else:
-            try:
-                res: Statements = await self.model.a_generate(
-                    prompt, schema=Statements
-                )
-                statements: List[str] = res.statements + [
-                    ele for ele in actual_output if isinstance(ele, MLLMImage)
-                ]
-                return statements
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                statements = data["statements"] + [
-                    ele for ele in actual_output if isinstance(ele, MLLMImage)
-                ]
-                return statements

-    def _generate_statements(
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Statements,
+            extract_schema=lambda s: s.statements
+            + [ele for ele in actual_output if isinstance(ele, MLLMImage)],
+            extract_json=lambda d: d["statements"]
+            + [ele for ele in actual_output if isinstance(ele, MLLMImage)],
+        )
+
+    async def _a_generate_statements(
         self,
         actual_output: str,
         multimodal: bool,
@@ -312,27 +280,16 @@ class AnswerRelevancyMetric(BaseMetric):
         prompt = self.evaluation_template.generate_statements(
             actual_output=actual_output, multimodal=multimodal
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=Statements)
-            self.evaluation_cost += cost
-            statements = res.statements + [
-                ele for ele in actual_output if isinstance(ele, MLLMImage)
-            ]
-            return statements
-        else:
-            try:
-                res: Statements = self.model.generate(prompt, schema=Statements)
-                statements = res.statements + [
-                    ele for ele in actual_output if isinstance(ele, MLLMImage)
-                ]
-                return statements
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                statements = data["statements"] + [
-                    ele for ele in actual_output if isinstance(ele, MLLMImage)
-                ]
-                return statements
+
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Statements,
+            extract_schema=lambda s: s.statements
+            + [ele for ele in actual_output if isinstance(ele, MLLMImage)],
+            extract_json=lambda d: d["statements"]
+            + [ele for ele in actual_output if isinstance(ele, MLLMImage)],
+        )

     def _calculate_score(self):
         number_of_verdicts = len(self.verdicts)
@@ -353,7 +310,7 @@ class AnswerRelevancyMetric(BaseMetric):
         else:
             try:
                 self.success = self.score >= self.threshold
-            except:
+            except TypeError:
                 self.success = False
         return self.success
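Throughout the answer relevancy changes above, the per-method using_native_model / TypeError-fallback branching is folded into two shared helpers, generate_with_schema_and_extract and a_generate_with_schema_and_extract, imported from deepeval.metrics.utils (whose own diff is not shown here). Judging only from the call sites in these hunks, the synchronous helper plausibly has roughly the shape sketched below, with the async variant as its await-based twin; treat this as an inferred sketch, not the actual implementation, and it assumes trimAndLoadJson is still available in deepeval.metrics.utils as it was in 3.7.5.

from typing import Any, Callable, Type, TypeVar

from deepeval.metrics.utils import trimAndLoadJson  # JSON-trimming loader used by the pre-3.7.7 code (assumed still present)

T = TypeVar("T")


def generate_with_schema_and_extract(
    metric: Any,  # a metric exposing .model, .using_native_model and .evaluation_cost
    prompt: str,
    schema_cls: Type[Any],
    extract_schema: Callable[[Any], T],
    extract_json: Callable[[dict], T],
) -> T:
    if metric.using_native_model:
        # Native models return a (parsed_schema, cost) tuple; track cost, then extract
        res, cost = metric.model.generate(prompt, schema=schema_cls)
        metric.evaluation_cost += cost
        return extract_schema(res)
    try:
        # Custom models that accept a schema kwarg return the parsed object directly
        res = metric.model.generate(prompt, schema=schema_cls)
        return extract_schema(res)
    except TypeError:
        # Schema-unaware custom models: fall back to raw generation plus JSON parsing
        res = metric.model.generate(prompt)
        data = trimAndLoadJson(res, metric)
        return extract_json(data)

Centralizing this branching is what lets each metric method shrink to a single call with two extraction lambdas, as seen in the _generate_reason, _generate_verdicts, and _generate_statements hunks above.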