deepeval 3.7.5__py3-none-any.whl → 3.7.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150):
  1. deepeval/_version.py +1 -1
  2. deepeval/cli/main.py +2022 -759
  3. deepeval/cli/utils.py +208 -36
  4. deepeval/config/dotenv_handler.py +19 -0
  5. deepeval/config/settings.py +675 -245
  6. deepeval/config/utils.py +9 -1
  7. deepeval/dataset/api.py +23 -1
  8. deepeval/dataset/golden.py +106 -21
  9. deepeval/evaluate/evaluate.py +0 -3
  10. deepeval/evaluate/execute.py +162 -315
  11. deepeval/evaluate/utils.py +6 -30
  12. deepeval/key_handler.py +124 -51
  13. deepeval/metrics/__init__.py +0 -4
  14. deepeval/metrics/answer_relevancy/answer_relevancy.py +89 -132
  15. deepeval/metrics/answer_relevancy/template.py +102 -179
  16. deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
  17. deepeval/metrics/arena_g_eval/template.py +17 -1
  18. deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
  19. deepeval/metrics/argument_correctness/template.py +19 -2
  20. deepeval/metrics/base_metric.py +19 -41
  21. deepeval/metrics/bias/bias.py +102 -108
  22. deepeval/metrics/bias/template.py +14 -2
  23. deepeval/metrics/contextual_precision/contextual_precision.py +56 -92
  24. deepeval/metrics/contextual_recall/contextual_recall.py +58 -85
  25. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +53 -83
  26. deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
  27. deepeval/metrics/conversation_completeness/template.py +23 -3
  28. deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
  29. deepeval/metrics/conversational_dag/nodes.py +66 -123
  30. deepeval/metrics/conversational_dag/templates.py +16 -0
  31. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
  32. deepeval/metrics/dag/dag.py +10 -0
  33. deepeval/metrics/dag/nodes.py +63 -126
  34. deepeval/metrics/dag/templates.py +14 -0
  35. deepeval/metrics/exact_match/exact_match.py +9 -1
  36. deepeval/metrics/faithfulness/faithfulness.py +82 -136
  37. deepeval/metrics/g_eval/g_eval.py +93 -79
  38. deepeval/metrics/g_eval/template.py +18 -1
  39. deepeval/metrics/g_eval/utils.py +7 -6
  40. deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
  41. deepeval/metrics/goal_accuracy/template.py +21 -3
  42. deepeval/metrics/hallucination/hallucination.py +60 -75
  43. deepeval/metrics/hallucination/template.py +13 -0
  44. deepeval/metrics/indicator.py +11 -10
  45. deepeval/metrics/json_correctness/json_correctness.py +40 -38
  46. deepeval/metrics/json_correctness/template.py +10 -0
  47. deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
  48. deepeval/metrics/knowledge_retention/schema.py +9 -3
  49. deepeval/metrics/knowledge_retention/template.py +12 -0
  50. deepeval/metrics/mcp/mcp_task_completion.py +72 -43
  51. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +93 -75
  52. deepeval/metrics/mcp/schema.py +4 -0
  53. deepeval/metrics/mcp/template.py +59 -0
  54. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
  55. deepeval/metrics/mcp_use_metric/template.py +12 -0
  56. deepeval/metrics/misuse/misuse.py +77 -97
  57. deepeval/metrics/misuse/template.py +15 -0
  58. deepeval/metrics/multimodal_metrics/__init__.py +0 -1
  59. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +37 -38
  60. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +55 -76
  61. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +37 -38
  62. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +37 -38
  63. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +57 -76
  64. deepeval/metrics/non_advice/non_advice.py +79 -105
  65. deepeval/metrics/non_advice/template.py +12 -0
  66. deepeval/metrics/pattern_match/pattern_match.py +12 -4
  67. deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
  68. deepeval/metrics/pii_leakage/template.py +14 -0
  69. deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
  70. deepeval/metrics/plan_adherence/template.py +11 -0
  71. deepeval/metrics/plan_quality/plan_quality.py +63 -87
  72. deepeval/metrics/plan_quality/template.py +9 -0
  73. deepeval/metrics/prompt_alignment/prompt_alignment.py +78 -86
  74. deepeval/metrics/prompt_alignment/template.py +12 -0
  75. deepeval/metrics/role_adherence/role_adherence.py +48 -71
  76. deepeval/metrics/role_adherence/template.py +14 -0
  77. deepeval/metrics/role_violation/role_violation.py +75 -108
  78. deepeval/metrics/role_violation/template.py +12 -0
  79. deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
  80. deepeval/metrics/step_efficiency/template.py +11 -0
  81. deepeval/metrics/summarization/summarization.py +115 -183
  82. deepeval/metrics/summarization/template.py +19 -0
  83. deepeval/metrics/task_completion/task_completion.py +67 -73
  84. deepeval/metrics/tool_correctness/tool_correctness.py +43 -42
  85. deepeval/metrics/tool_use/schema.py +4 -0
  86. deepeval/metrics/tool_use/template.py +16 -2
  87. deepeval/metrics/tool_use/tool_use.py +72 -94
  88. deepeval/metrics/topic_adherence/schema.py +4 -0
  89. deepeval/metrics/topic_adherence/template.py +21 -1
  90. deepeval/metrics/topic_adherence/topic_adherence.py +68 -81
  91. deepeval/metrics/toxicity/template.py +13 -0
  92. deepeval/metrics/toxicity/toxicity.py +80 -99
  93. deepeval/metrics/turn_contextual_precision/schema.py +3 -3
  94. deepeval/metrics/turn_contextual_precision/template.py +9 -2
  95. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +154 -154
  96. deepeval/metrics/turn_contextual_recall/schema.py +3 -3
  97. deepeval/metrics/turn_contextual_recall/template.py +8 -1
  98. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +148 -143
  99. deepeval/metrics/turn_contextual_relevancy/schema.py +2 -2
  100. deepeval/metrics/turn_contextual_relevancy/template.py +8 -1
  101. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +154 -157
  102. deepeval/metrics/turn_faithfulness/schema.py +1 -1
  103. deepeval/metrics/turn_faithfulness/template.py +8 -1
  104. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +180 -203
  105. deepeval/metrics/turn_relevancy/template.py +14 -0
  106. deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
  107. deepeval/metrics/utils.py +161 -91
  108. deepeval/models/__init__.py +2 -0
  109. deepeval/models/base_model.py +44 -6
  110. deepeval/models/embedding_models/azure_embedding_model.py +34 -12
  111. deepeval/models/embedding_models/local_embedding_model.py +22 -7
  112. deepeval/models/embedding_models/ollama_embedding_model.py +17 -6
  113. deepeval/models/embedding_models/openai_embedding_model.py +3 -2
  114. deepeval/models/llms/__init__.py +2 -0
  115. deepeval/models/llms/amazon_bedrock_model.py +229 -73
  116. deepeval/models/llms/anthropic_model.py +143 -48
  117. deepeval/models/llms/azure_model.py +169 -95
  118. deepeval/models/llms/constants.py +2032 -0
  119. deepeval/models/llms/deepseek_model.py +82 -35
  120. deepeval/models/llms/gemini_model.py +126 -67
  121. deepeval/models/llms/grok_model.py +128 -65
  122. deepeval/models/llms/kimi_model.py +129 -87
  123. deepeval/models/llms/litellm_model.py +94 -18
  124. deepeval/models/llms/local_model.py +115 -16
  125. deepeval/models/llms/ollama_model.py +97 -76
  126. deepeval/models/llms/openai_model.py +169 -311
  127. deepeval/models/llms/portkey_model.py +58 -16
  128. deepeval/models/llms/utils.py +5 -2
  129. deepeval/models/retry_policy.py +10 -5
  130. deepeval/models/utils.py +56 -4
  131. deepeval/simulator/conversation_simulator.py +49 -2
  132. deepeval/simulator/template.py +16 -1
  133. deepeval/synthesizer/synthesizer.py +19 -17
  134. deepeval/test_case/api.py +24 -45
  135. deepeval/test_case/arena_test_case.py +7 -2
  136. deepeval/test_case/conversational_test_case.py +55 -6
  137. deepeval/test_case/llm_test_case.py +60 -6
  138. deepeval/test_run/api.py +3 -0
  139. deepeval/test_run/test_run.py +6 -1
  140. deepeval/utils.py +26 -0
  141. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/METADATA +3 -3
  142. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/RECORD +145 -148
  143. deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
  144. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
  145. deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
  146. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -133
  147. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
  148. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/LICENSE.md +0 -0
  149. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/WHEEL +0 -0
  150. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/entry_points.txt +0 -0
@@ -1,8 +1,7 @@
1
- import warnings
2
-
3
- from typing import Optional, Tuple, Union, Dict
1
+ from typing import Optional, Tuple, Union, Dict, List
4
2
  from pydantic import BaseModel, SecretStr
5
3
 
4
+ from deepeval.errors import DeepEvalError
6
5
  from deepeval.models import DeepEvalBaseLLM
7
6
  from deepeval.models.llms.utils import trim_and_load_json
8
7
  from deepeval.models.retry_policy import (
@@ -10,42 +9,39 @@ from deepeval.models.retry_policy import (
10
9
  sdk_retries_for,
11
10
  )
12
11
  from deepeval.models.utils import (
12
+ require_costs,
13
13
  require_secret_api_key,
14
14
  normalize_kwargs_and_extract_aliases,
15
15
  )
16
+ from deepeval.test_case import MLLMImage
17
+ from deepeval.utils import check_if_multimodal, convert_to_multi_modal_array
16
18
  from deepeval.config.settings import get_settings
17
19
  from deepeval.constants import ProviderSlug as PS
18
- from deepeval.utils import require_dependency
20
+ from deepeval.utils import require_dependency, require_param
21
+ from deepeval.models.llms.constants import ANTHROPIC_MODELS_DATA
19
22
 
20
23
  # consistent retry rules
21
24
  retry_anthropic = create_retry_decorator(PS.ANTHROPIC)
22
25
 
23
- model_pricing = {
24
- "claude-opus-4-20250514": {"input": 15.00 / 1e6, "output": 75.00 / 1e6},
25
- "claude-sonnet-4-20250514": {"input": 3.00 / 1e6, "output": 15.00 / 1e6},
26
- "claude-3-7-sonnet-latest": {"input": 3.00 / 1e6, "output": 15.00 / 1e6},
27
- "claude-3-5-haiku-latest": {"input": 0.80 / 1e6, "output": 4.00 / 1e6},
28
- "claude-3-5-sonnet-latest": {"input": 3.00 / 1e6, "output": 15.00 / 1e6},
29
- "claude-3-opus-latest": {"input": 15.00 / 1e6, "output": 75.00 / 1e6},
30
- "claude-3-sonnet-20240229": {"input": 3.00 / 1e6, "output": 15.00 / 1e6},
31
- "claude-3-haiku-20240307": {"input": 0.25 / 1e6, "output": 1.25 / 1e6},
32
- "claude-instant-1.2": {"input": 0.80 / 1e6, "output": 2.40 / 1e6},
33
- }
34
-
35
26
  _ALIAS_MAP = {
36
27
  "api_key": ["_anthropic_api_key"],
37
28
  }
38
29
 
30
+ default_model = "claude-3-7-sonnet-latest"
31
+
39
32
 
40
33
  class AnthropicModel(DeepEvalBaseLLM):
41
34
  def __init__(
42
35
  self,
43
- model: str = "claude-3-7-sonnet-latest",
36
+ model: Optional[str] = None,
44
37
  api_key: Optional[str] = None,
45
- temperature: float = 0,
38
+ temperature: Optional[float] = None,
39
+ cost_per_input_token: Optional[float] = None,
40
+ cost_per_output_token: Optional[float] = None,
46
41
  generation_kwargs: Optional[Dict] = None,
47
42
  **kwargs,
48
43
  ):
44
+ settings = get_settings()
49
45
  normalized_kwargs, alias_values = normalize_kwargs_and_extract_aliases(
50
46
  "AnthropicModel",
51
47
  kwargs,
@@ -58,17 +54,71 @@ class AnthropicModel(DeepEvalBaseLLM):
58
54
 
59
55
  if api_key is not None:
60
56
  # keep it secret, keep it safe from serializings, logging and alike
61
- self.api_key: SecretStr | None = SecretStr(api_key)
57
+ self.api_key: Optional[SecretStr] = SecretStr(api_key)
58
+ else:
59
+ self.api_key = settings.ANTHROPIC_API_KEY
60
+
61
+ model = model or settings.ANTHROPIC_MODEL_NAME or default_model
62
+
63
+ if temperature is not None:
64
+ temperature = float(temperature)
65
+ elif settings.TEMPERATURE is not None:
66
+ temperature = settings.TEMPERATURE
62
67
  else:
63
- self.api_key = get_settings().ANTHROPIC_API_KEY
68
+ temperature = 0.0
69
+
70
+ cost_per_input_token = (
71
+ cost_per_input_token
72
+ if cost_per_input_token is not None
73
+ else settings.ANTHROPIC_COST_PER_INPUT_TOKEN
74
+ )
75
+ cost_per_output_token = (
76
+ cost_per_output_token
77
+ if cost_per_output_token is not None
78
+ else settings.ANTHROPIC_COST_PER_OUTPUT_TOKEN
79
+ )
80
+
81
+ # Validation
82
+ model = require_param(
83
+ model,
84
+ provider_label="AnthropicModel",
85
+ env_var_name="ANTHROPIC_MODEL_NAME",
86
+ param_hint="model",
87
+ )
64
88
 
65
89
  if temperature < 0:
66
- raise ValueError("Temperature must be >= 0.")
90
+ raise DeepEvalError("Temperature must be >= 0.")
67
91
  self.temperature = temperature
68
92
 
93
+ self.model_data = ANTHROPIC_MODELS_DATA.get(model)
94
+
95
+ cost_per_input_token, cost_per_output_token = require_costs(
96
+ self.model_data,
97
+ model,
98
+ "ANTHROPIC_COST_PER_INPUT_TOKEN",
99
+ "ANTHROPIC_COST_PER_OUTPUT_TOKEN",
100
+ cost_per_input_token,
101
+ cost_per_output_token,
102
+ )
103
+ self.model_data.input_price = cost_per_input_token
104
+ self.model_data.output_price = cost_per_output_token
105
+
69
106
  # Keep sanitized kwargs for client call to strip legacy keys
70
107
  self.kwargs = normalized_kwargs
71
- self.generation_kwargs = generation_kwargs or {}
108
+ self.kwargs.pop(
109
+ "temperature", None
110
+ ) # to avoid duplicate with self.temperature
111
+ max_tokens = self.kwargs.pop("max_tokens", None)
112
+
113
+ self.generation_kwargs = dict(generation_kwargs or {})
114
+ self.generation_kwargs.pop(
115
+ "temperature", None
116
+ ) # to avoid duplicate with self.temperature
117
+ default_max_tokens = 1024 if max_tokens is None else max_tokens
118
+ self._max_tokens = int(
119
+ self.generation_kwargs.pop("max_tokens", default_max_tokens)
120
+ )
121
+
72
122
  super().__init__(model)
73
123
 
74
124
  ###############################################
@@ -78,15 +128,22 @@ class AnthropicModel(DeepEvalBaseLLM):
78
128
  @retry_anthropic
79
129
  def generate(
80
130
  self, prompt: str, schema: Optional[BaseModel] = None
81
- ) -> Tuple[Union[str, Dict], float]:
131
+ ) -> Tuple[Union[str, BaseModel], float]:
132
+ if check_if_multimodal(prompt):
133
+ prompt = convert_to_multi_modal_array(input=prompt)
134
+ content = self.generate_content(prompt)
135
+ else:
136
+ content = [{"type": "text", "text": prompt}]
82
137
 
138
+ # Get max_tokens from kwargs, default to 1024 if not provided
139
+ max_tokens = self._max_tokens
83
140
  chat_model = self.load_model()
84
141
  message = chat_model.messages.create(
85
- max_tokens=1024,
142
+ max_tokens=max_tokens,
86
143
  messages=[
87
144
  {
88
145
  "role": "user",
89
- "content": prompt,
146
+ "content": content,
90
147
  }
91
148
  ],
92
149
  model=self.name,
@@ -105,15 +162,22 @@ class AnthropicModel(DeepEvalBaseLLM):
105
162
  @retry_anthropic
106
163
  async def a_generate(
107
164
  self, prompt: str, schema: Optional[BaseModel] = None
108
- ) -> Tuple[str, float]:
165
+ ) -> Tuple[Union[str, BaseModel], float]:
166
+ if check_if_multimodal(prompt):
167
+ prompt = convert_to_multi_modal_array(input=prompt)
168
+ content = self.generate_content(prompt)
169
+ else:
170
+ content = [{"type": "text", "text": prompt}]
109
171
 
172
+ # Get max_tokens from kwargs, default to 1024 if not provided
173
+ max_tokens = self._max_tokens
110
174
  chat_model = self.load_model(async_mode=True)
111
175
  message = await chat_model.messages.create(
112
- max_tokens=1024,
176
+ max_tokens=max_tokens,
113
177
  messages=[
114
178
  {
115
179
  "role": "user",
116
- "content": prompt,
180
+ "content": content,
117
181
  }
118
182
  ],
119
183
  model=self.name,
@@ -130,31 +194,62 @@ class AnthropicModel(DeepEvalBaseLLM):
130
194
 
131
195
  return schema.model_validate(json_output), cost
132
196
 
197
+ def generate_content(self, multimodal_input: List[Union[str, MLLMImage]]):
198
+ content = []
199
+ for element in multimodal_input:
200
+ if isinstance(element, str):
201
+ content.append({"type": "text", "text": element})
202
+ elif isinstance(element, MLLMImage):
203
+ if element.url and not element.local:
204
+ content.append(
205
+ {
206
+ "type": "image",
207
+ "source": {"type": "url", "url": element.url},
208
+ }
209
+ )
210
+ else:
211
+ element.ensure_images_loaded()
212
+ mime_type = element.mimeType or "image/jpeg"
213
+ content.append(
214
+ {
215
+ "type": "image",
216
+ "source": {
217
+ "type": "base64",
218
+ "media_type": mime_type,
219
+ "data": element.dataBase64,
220
+ },
221
+ }
222
+ )
223
+ return content
224
+
133
225
  ###############################################
134
226
  # Utilities
135
227
  ###############################################
136
228
 
137
229
  def calculate_cost(self, input_tokens: int, output_tokens: int) -> float:
138
- pricing = model_pricing.get(self.name)
139
-
140
- if pricing is None:
141
- # Calculate average cost from all known models
142
- avg_input_cost = sum(
143
- p["input"] for p in model_pricing.values()
144
- ) / len(model_pricing)
145
- avg_output_cost = sum(
146
- p["output"] for p in model_pricing.values()
147
- ) / len(model_pricing)
148
- pricing = {"input": avg_input_cost, "output": avg_output_cost}
149
-
150
- warnings.warn(
151
- f"[Warning] Pricing not defined for model '{self.name}'. "
152
- "Using average input/output token costs from existing model_pricing."
153
- )
154
-
155
- input_cost = input_tokens * pricing["input"]
156
- output_cost = output_tokens * pricing["output"]
157
- return input_cost + output_cost
230
+ if self.model_data.input_price and self.model_data.output_price:
231
+ input_cost = input_tokens * self.model_data.input_price
232
+ output_cost = output_tokens * self.model_data.output_price
233
+ return input_cost + output_cost
234
+
235
+ #########################
236
+ # Capabilities #
237
+ #########################
238
+
239
+ def supports_log_probs(self) -> Union[bool, None]:
240
+ return self.model_data.supports_log_probs
241
+
242
+ def supports_temperature(self) -> Union[bool, None]:
243
+ return self.model_data.supports_temperature
244
+
245
+ def supports_multimodal(self) -> Union[bool, None]:
246
+ return self.model_data.supports_multimodal
247
+
248
+ def supports_structured_outputs(self) -> Union[bool, None]:
249
+ return self.model_data.supports_structured_outputs
250
+
251
+ def supports_json_mode(self) -> Union[bool, None]:
252
+ return self.model_data.supports_json
158
253
 
159
254
  ###############################################
160
255
  # Model