deepeval 3.7.4__py3-none-any.whl → 3.7.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (224)
  1. deepeval/_version.py +1 -1
  2. deepeval/config/settings.py +35 -1
  3. deepeval/dataset/api.py +23 -1
  4. deepeval/dataset/golden.py +139 -2
  5. deepeval/evaluate/evaluate.py +16 -11
  6. deepeval/evaluate/execute.py +13 -181
  7. deepeval/evaluate/utils.py +6 -26
  8. deepeval/integrations/pydantic_ai/agent.py +19 -2
  9. deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
  10. deepeval/key_handler.py +3 -0
  11. deepeval/metrics/__init__.py +14 -16
  12. deepeval/metrics/answer_relevancy/answer_relevancy.py +118 -116
  13. deepeval/metrics/answer_relevancy/template.py +22 -3
  14. deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
  15. deepeval/metrics/arena_g_eval/template.py +17 -1
  16. deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
  17. deepeval/metrics/argument_correctness/template.py +19 -2
  18. deepeval/metrics/base_metric.py +13 -44
  19. deepeval/metrics/bias/bias.py +102 -108
  20. deepeval/metrics/bias/template.py +14 -2
  21. deepeval/metrics/contextual_precision/contextual_precision.py +96 -94
  22. deepeval/metrics/contextual_precision/template.py +115 -66
  23. deepeval/metrics/contextual_recall/contextual_recall.py +94 -84
  24. deepeval/metrics/contextual_recall/template.py +106 -55
  25. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +86 -84
  26. deepeval/metrics/contextual_relevancy/template.py +87 -58
  27. deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
  28. deepeval/metrics/conversation_completeness/template.py +23 -3
  29. deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
  30. deepeval/metrics/conversational_dag/nodes.py +66 -123
  31. deepeval/metrics/conversational_dag/templates.py +16 -0
  32. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
  33. deepeval/metrics/dag/dag.py +10 -0
  34. deepeval/metrics/dag/nodes.py +63 -126
  35. deepeval/metrics/dag/templates.py +16 -2
  36. deepeval/metrics/exact_match/exact_match.py +9 -1
  37. deepeval/metrics/faithfulness/faithfulness.py +138 -149
  38. deepeval/metrics/faithfulness/schema.py +1 -1
  39. deepeval/metrics/faithfulness/template.py +200 -115
  40. deepeval/metrics/g_eval/g_eval.py +87 -78
  41. deepeval/metrics/g_eval/template.py +18 -1
  42. deepeval/metrics/g_eval/utils.py +7 -6
  43. deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
  44. deepeval/metrics/goal_accuracy/template.py +21 -3
  45. deepeval/metrics/hallucination/hallucination.py +60 -75
  46. deepeval/metrics/hallucination/template.py +13 -0
  47. deepeval/metrics/indicator.py +7 -10
  48. deepeval/metrics/json_correctness/json_correctness.py +40 -38
  49. deepeval/metrics/json_correctness/template.py +10 -0
  50. deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
  51. deepeval/metrics/knowledge_retention/schema.py +9 -3
  52. deepeval/metrics/knowledge_retention/template.py +12 -0
  53. deepeval/metrics/mcp/mcp_task_completion.py +68 -38
  54. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
  55. deepeval/metrics/mcp/template.py +52 -0
  56. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
  57. deepeval/metrics/mcp_use_metric/template.py +12 -0
  58. deepeval/metrics/misuse/misuse.py +77 -97
  59. deepeval/metrics/misuse/template.py +15 -0
  60. deepeval/metrics/multimodal_metrics/__init__.py +0 -19
  61. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +59 -53
  62. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +79 -95
  63. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +59 -53
  64. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +59 -53
  65. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +111 -109
  66. deepeval/metrics/non_advice/non_advice.py +79 -105
  67. deepeval/metrics/non_advice/template.py +12 -0
  68. deepeval/metrics/pattern_match/pattern_match.py +12 -4
  69. deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
  70. deepeval/metrics/pii_leakage/template.py +14 -0
  71. deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
  72. deepeval/metrics/plan_adherence/template.py +11 -0
  73. deepeval/metrics/plan_quality/plan_quality.py +63 -87
  74. deepeval/metrics/plan_quality/template.py +9 -0
  75. deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
  76. deepeval/metrics/prompt_alignment/template.py +12 -0
  77. deepeval/metrics/ragas.py +3 -3
  78. deepeval/metrics/role_adherence/role_adherence.py +48 -71
  79. deepeval/metrics/role_adherence/template.py +14 -0
  80. deepeval/metrics/role_violation/role_violation.py +75 -108
  81. deepeval/metrics/role_violation/template.py +12 -0
  82. deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
  83. deepeval/metrics/step_efficiency/template.py +11 -0
  84. deepeval/metrics/summarization/summarization.py +115 -183
  85. deepeval/metrics/summarization/template.py +19 -0
  86. deepeval/metrics/task_completion/task_completion.py +67 -73
  87. deepeval/metrics/tool_correctness/tool_correctness.py +45 -44
  88. deepeval/metrics/tool_use/tool_use.py +42 -66
  89. deepeval/metrics/topic_adherence/template.py +13 -0
  90. deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
  91. deepeval/metrics/toxicity/template.py +13 -0
  92. deepeval/metrics/toxicity/toxicity.py +80 -99
  93. deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  94. deepeval/metrics/turn_contextual_precision/template.py +187 -0
  95. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +592 -0
  96. deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  97. deepeval/metrics/turn_contextual_recall/template.py +178 -0
  98. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +563 -0
  99. deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
  100. deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
  101. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +576 -0
  102. deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
  103. deepeval/metrics/turn_faithfulness/template.py +218 -0
  104. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +627 -0
  105. deepeval/metrics/turn_relevancy/template.py +14 -0
  106. deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
  107. deepeval/metrics/utils.py +158 -122
  108. deepeval/models/__init__.py +0 -12
  109. deepeval/models/base_model.py +49 -33
  110. deepeval/models/embedding_models/__init__.py +7 -0
  111. deepeval/models/embedding_models/azure_embedding_model.py +79 -33
  112. deepeval/models/embedding_models/local_embedding_model.py +39 -20
  113. deepeval/models/embedding_models/ollama_embedding_model.py +52 -19
  114. deepeval/models/embedding_models/openai_embedding_model.py +42 -22
  115. deepeval/models/llms/amazon_bedrock_model.py +226 -72
  116. deepeval/models/llms/anthropic_model.py +178 -63
  117. deepeval/models/llms/azure_model.py +218 -60
  118. deepeval/models/llms/constants.py +2032 -0
  119. deepeval/models/llms/deepseek_model.py +95 -40
  120. deepeval/models/llms/gemini_model.py +209 -64
  121. deepeval/models/llms/grok_model.py +139 -68
  122. deepeval/models/llms/kimi_model.py +140 -90
  123. deepeval/models/llms/litellm_model.py +131 -37
  124. deepeval/models/llms/local_model.py +125 -21
  125. deepeval/models/llms/ollama_model.py +147 -24
  126. deepeval/models/llms/openai_model.py +222 -269
  127. deepeval/models/llms/portkey_model.py +81 -22
  128. deepeval/models/llms/utils.py +8 -3
  129. deepeval/models/retry_policy.py +17 -14
  130. deepeval/models/utils.py +106 -5
  131. deepeval/optimizer/__init__.py +5 -0
  132. deepeval/optimizer/algorithms/__init__.py +6 -0
  133. deepeval/optimizer/algorithms/base.py +29 -0
  134. deepeval/optimizer/algorithms/configs.py +18 -0
  135. deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  136. deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
  137. deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  138. deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
  139. deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  140. deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  141. deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  142. deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  143. deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  144. deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
  145. deepeval/{optimization → optimizer}/configs.py +5 -8
  146. deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
  147. deepeval/optimizer/prompt_optimizer.py +263 -0
  148. deepeval/optimizer/rewriter/__init__.py +5 -0
  149. deepeval/optimizer/rewriter/rewriter.py +124 -0
  150. deepeval/optimizer/rewriter/utils.py +214 -0
  151. deepeval/optimizer/scorer/__init__.py +5 -0
  152. deepeval/optimizer/scorer/base.py +86 -0
  153. deepeval/optimizer/scorer/scorer.py +316 -0
  154. deepeval/optimizer/scorer/utils.py +30 -0
  155. deepeval/optimizer/types.py +148 -0
  156. deepeval/{optimization → optimizer}/utils.py +47 -165
  157. deepeval/prompt/prompt.py +5 -9
  158. deepeval/simulator/conversation_simulator.py +43 -0
  159. deepeval/simulator/template.py +13 -0
  160. deepeval/test_case/__init__.py +1 -3
  161. deepeval/test_case/api.py +26 -45
  162. deepeval/test_case/arena_test_case.py +7 -2
  163. deepeval/test_case/conversational_test_case.py +68 -1
  164. deepeval/test_case/llm_test_case.py +206 -1
  165. deepeval/test_case/utils.py +4 -8
  166. deepeval/test_run/api.py +18 -14
  167. deepeval/test_run/test_run.py +3 -3
  168. deepeval/tracing/patchers.py +9 -4
  169. deepeval/tracing/tracing.py +2 -2
  170. deepeval/utils.py +65 -0
  171. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -4
  172. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/RECORD +180 -193
  173. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
  174. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
  175. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
  176. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
  177. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
  178. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
  179. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
  180. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
  181. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
  182. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
  183. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
  184. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  185. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
  186. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
  187. deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
  188. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
  189. deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
  190. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -148
  191. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
  192. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  193. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
  194. deepeval/models/mlllms/__init__.py +0 -4
  195. deepeval/models/mlllms/azure_model.py +0 -343
  196. deepeval/models/mlllms/gemini_model.py +0 -313
  197. deepeval/models/mlllms/ollama_model.py +0 -175
  198. deepeval/models/mlllms/openai_model.py +0 -309
  199. deepeval/optimization/__init__.py +0 -13
  200. deepeval/optimization/adapters/__init__.py +0 -2
  201. deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
  202. deepeval/optimization/aggregates.py +0 -14
  203. deepeval/optimization/copro/configs.py +0 -31
  204. deepeval/optimization/gepa/__init__.py +0 -7
  205. deepeval/optimization/gepa/configs.py +0 -115
  206. deepeval/optimization/miprov2/configs.py +0 -134
  207. deepeval/optimization/miprov2/loop.py +0 -785
  208. deepeval/optimization/mutations/__init__.py +0 -0
  209. deepeval/optimization/mutations/prompt_rewriter.py +0 -458
  210. deepeval/optimization/policies/__init__.py +0 -16
  211. deepeval/optimization/policies/tie_breaker.py +0 -67
  212. deepeval/optimization/prompt_optimizer.py +0 -462
  213. deepeval/optimization/simba/__init__.py +0 -0
  214. deepeval/optimization/simba/configs.py +0 -33
  215. deepeval/optimization/types.py +0 -361
  216. deepeval/test_case/mllm_test_case.py +0 -170
  217. /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
  218. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
  219. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
  220. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
  221. /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
  222. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
  223. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
  224. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
@@ -1,62 +1,125 @@
1
- import warnings
2
-
3
- from typing import Optional, Tuple, Union, Dict
4
- from anthropic import Anthropic, AsyncAnthropic
1
+ from typing import Optional, Tuple, Union, Dict, List
5
2
  from pydantic import BaseModel, SecretStr
6
3
 
4
+ from deepeval.errors import DeepEvalError
7
5
  from deepeval.models import DeepEvalBaseLLM
8
6
  from deepeval.models.llms.utils import trim_and_load_json
9
7
  from deepeval.models.retry_policy import (
10
8
  create_retry_decorator,
11
9
  sdk_retries_for,
12
10
  )
13
- from deepeval.models.utils import parse_model_name, require_secret_api_key
11
+ from deepeval.models.utils import (
12
+ require_costs,
13
+ require_secret_api_key,
14
+ normalize_kwargs_and_extract_aliases,
15
+ )
16
+ from deepeval.test_case import MLLMImage
17
+ from deepeval.utils import check_if_multimodal, convert_to_multi_modal_array
14
18
  from deepeval.config.settings import get_settings
15
19
  from deepeval.constants import ProviderSlug as PS
16
-
20
+ from deepeval.utils import require_dependency, require_param
21
+ from deepeval.models.llms.constants import ANTHROPIC_MODELS_DATA
17
22
 
18
23
  # consistent retry rules
19
24
  retry_anthropic = create_retry_decorator(PS.ANTHROPIC)
20
25
 
21
- model_pricing = {
22
- "claude-opus-4-20250514": {"input": 15.00 / 1e6, "output": 75.00 / 1e6},
23
- "claude-sonnet-4-20250514": {"input": 3.00 / 1e6, "output": 15.00 / 1e6},
24
- "claude-3-7-sonnet-latest": {"input": 3.00 / 1e6, "output": 15.00 / 1e6},
25
- "claude-3-5-haiku-latest": {"input": 0.80 / 1e6, "output": 4.00 / 1e6},
26
- "claude-3-5-sonnet-latest": {"input": 3.00 / 1e6, "output": 15.00 / 1e6},
27
- "claude-3-opus-latest": {"input": 15.00 / 1e6, "output": 75.00 / 1e6},
28
- "claude-3-sonnet-20240229": {"input": 3.00 / 1e6, "output": 15.00 / 1e6},
29
- "claude-3-haiku-20240307": {"input": 0.25 / 1e6, "output": 1.25 / 1e6},
30
- "claude-instant-1.2": {"input": 0.80 / 1e6, "output": 2.40 / 1e6},
26
+ _ALIAS_MAP = {
27
+ "api_key": ["_anthropic_api_key"],
31
28
  }
32
29
 
30
+ default_model = "claude-3-7-sonnet-latest"
31
+
33
32
 
34
33
  class AnthropicModel(DeepEvalBaseLLM):
35
34
  def __init__(
36
35
  self,
37
- model: str = "claude-3-7-sonnet-latest",
38
- temperature: float = 0,
39
- _anthropic_api_key: Optional[str] = None,
36
+ model: Optional[str] = None,
37
+ api_key: Optional[str] = None,
38
+ temperature: Optional[float] = None,
39
+ cost_per_input_token: Optional[float] = None,
40
+ cost_per_output_token: Optional[float] = None,
40
41
  generation_kwargs: Optional[Dict] = None,
41
42
  **kwargs,
42
43
  ):
43
- model_name = parse_model_name(model)
44
+ settings = get_settings()
45
+ normalized_kwargs, alias_values = normalize_kwargs_and_extract_aliases(
46
+ "AnthropicModel",
47
+ kwargs,
48
+ _ALIAS_MAP,
49
+ )
44
50
 
45
- if _anthropic_api_key is not None:
51
+ # re-map depricated keywords to re-named positional args
52
+ if api_key is None and "api_key" in alias_values:
53
+ api_key = alias_values["api_key"]
54
+
55
+ if api_key is not None:
46
56
  # keep it secret, keep it safe from serializings, logging and alike
47
- self._anthropic_api_key: SecretStr | None = SecretStr(
48
- _anthropic_api_key
49
- )
57
+ self.api_key: Optional[SecretStr] = SecretStr(api_key)
50
58
  else:
51
- self._anthropic_api_key = get_settings().ANTHROPIC_API_KEY
59
+ self.api_key = settings.ANTHROPIC_API_KEY
60
+
61
+ model = model or settings.ANTHROPIC_MODEL_NAME or default_model
62
+
63
+ if temperature is not None:
64
+ temperature = float(temperature)
65
+ elif settings.TEMPERATURE is not None:
66
+ temperature = settings.TEMPERATURE
67
+ else:
68
+ temperature = 0.0
69
+
70
+ cost_per_input_token = (
71
+ cost_per_input_token
72
+ if cost_per_input_token is not None
73
+ else settings.ANTHROPIC_COST_PER_INPUT_TOKEN
74
+ )
75
+ cost_per_output_token = (
76
+ cost_per_output_token
77
+ if cost_per_output_token is not None
78
+ else settings.ANTHROPIC_COST_PER_OUTPUT_TOKEN
79
+ )
80
+
81
+ # Validation
82
+ model = require_param(
83
+ model,
84
+ provider_label="AnthropicModel",
85
+ env_var_name="ANTHROPIC_MODEL_NAME",
86
+ param_hint="model",
87
+ )
52
88
 
53
89
  if temperature < 0:
54
- raise ValueError("Temperature must be >= 0.")
90
+ raise DeepEvalError("Temperature must be >= 0.")
55
91
  self.temperature = temperature
56
92
 
57
- self.kwargs = kwargs
58
- self.generation_kwargs = generation_kwargs or {}
59
- super().__init__(model_name)
93
+ self.model_data = ANTHROPIC_MODELS_DATA.get(model)
94
+
95
+ cost_per_input_token, cost_per_output_token = require_costs(
96
+ self.model_data,
97
+ model,
98
+ "ANTHROPIC_COST_PER_INPUT_TOKEN",
99
+ "ANTHROPIC_COST_PER_OUTPUT_TOKEN",
100
+ cost_per_input_token,
101
+ cost_per_output_token,
102
+ )
103
+ self.model_data.input_price = cost_per_input_token
104
+ self.model_data.output_price = cost_per_output_token
105
+
106
+ # Keep sanitized kwargs for client call to strip legacy keys
107
+ self.kwargs = normalized_kwargs
108
+ self.kwargs.pop(
109
+ "temperature", None
110
+ ) # to avoid duplicate with self.temperature
111
+ max_tokens = self.kwargs.pop("max_tokens", None)
112
+
113
+ self.generation_kwargs = dict(generation_kwargs or {})
114
+ self.generation_kwargs.pop(
115
+ "temperature", None
116
+ ) # to avoid duplicate with self.temperature
117
+ default_max_tokens = 1024 if max_tokens is None else max_tokens
118
+ self._max_tokens = int(
119
+ self.generation_kwargs.pop("max_tokens", default_max_tokens)
120
+ )
121
+
122
+ super().__init__(model)
60
123
 
61
124
  ###############################################
62
125
  # Generate functions
@@ -65,17 +128,25 @@ class AnthropicModel(DeepEvalBaseLLM):
65
128
  @retry_anthropic
66
129
  def generate(
67
130
  self, prompt: str, schema: Optional[BaseModel] = None
68
- ) -> Tuple[Union[str, Dict], float]:
131
+ ) -> Tuple[Union[str, BaseModel], float]:
132
+ if check_if_multimodal(prompt):
133
+ prompt = convert_to_multi_modal_array(input=prompt)
134
+ content = self.generate_content(prompt)
135
+ else:
136
+ content = [{"type": "text", "text": prompt}]
137
+
138
+ # Get max_tokens from kwargs, default to 1024 if not provided
139
+ max_tokens = self._max_tokens
69
140
  chat_model = self.load_model()
70
141
  message = chat_model.messages.create(
71
- max_tokens=1024,
142
+ max_tokens=max_tokens,
72
143
  messages=[
73
144
  {
74
145
  "role": "user",
75
- "content": prompt,
146
+ "content": content,
76
147
  }
77
148
  ],
78
- model=self.model_name,
149
+ model=self.name,
79
150
  temperature=self.temperature,
80
151
  **self.generation_kwargs,
81
152
  )
@@ -91,17 +162,25 @@ class AnthropicModel(DeepEvalBaseLLM):
91
162
  @retry_anthropic
92
163
  async def a_generate(
93
164
  self, prompt: str, schema: Optional[BaseModel] = None
94
- ) -> Tuple[str, float]:
165
+ ) -> Tuple[Union[str, BaseModel], float]:
166
+ if check_if_multimodal(prompt):
167
+ prompt = convert_to_multi_modal_array(input=prompt)
168
+ content = self.generate_content(prompt)
169
+ else:
170
+ content = [{"type": "text", "text": prompt}]
171
+
172
+ # Get max_tokens from kwargs, default to 1024 if not provided
173
+ max_tokens = self._max_tokens
95
174
  chat_model = self.load_model(async_mode=True)
96
175
  message = await chat_model.messages.create(
97
- max_tokens=1024,
176
+ max_tokens=max_tokens,
98
177
  messages=[
99
178
  {
100
179
  "role": "user",
101
- "content": prompt,
180
+ "content": content,
102
181
  }
103
182
  ],
104
- model=self.model_name,
183
+ model=self.name,
105
184
  temperature=self.temperature,
106
185
  **self.generation_kwargs,
107
186
  )
@@ -115,43 +194,76 @@ class AnthropicModel(DeepEvalBaseLLM):
115
194
 
116
195
  return schema.model_validate(json_output), cost
117
196
 
197
+ def generate_content(self, multimodal_input: List[Union[str, MLLMImage]]):
198
+ content = []
199
+ for element in multimodal_input:
200
+ if isinstance(element, str):
201
+ content.append({"type": "text", "text": element})
202
+ elif isinstance(element, MLLMImage):
203
+ if element.url and not element.local:
204
+ content.append(
205
+ {
206
+ "type": "image",
207
+ "source": {"type": "url", "url": element.url},
208
+ }
209
+ )
210
+ else:
211
+ element.ensure_images_loaded()
212
+ mime_type = element.mimeType or "image/jpeg"
213
+ content.append(
214
+ {
215
+ "type": "image",
216
+ "source": {
217
+ "type": "base64",
218
+ "media_type": mime_type,
219
+ "data": element.dataBase64,
220
+ },
221
+ }
222
+ )
223
+ return content
224
+
118
225
  ###############################################
119
226
  # Utilities
120
227
  ###############################################
121
228
 
122
229
  def calculate_cost(self, input_tokens: int, output_tokens: int) -> float:
123
- pricing = model_pricing.get(self.model_name)
124
-
125
- if pricing is None:
126
- # Calculate average cost from all known models
127
- avg_input_cost = sum(
128
- p["input"] for p in model_pricing.values()
129
- ) / len(model_pricing)
130
- avg_output_cost = sum(
131
- p["output"] for p in model_pricing.values()
132
- ) / len(model_pricing)
133
- pricing = {"input": avg_input_cost, "output": avg_output_cost}
134
-
135
- warnings.warn(
136
- f"[Warning] Pricing not defined for model '{self.model_name}'. "
137
- "Using average input/output token costs from existing model_pricing."
138
- )
139
-
140
- input_cost = input_tokens * pricing["input"]
141
- output_cost = output_tokens * pricing["output"]
230
+ input_cost = input_tokens * self.model_data.input_price
231
+ output_cost = output_tokens * self.model_data.output_price
142
232
  return input_cost + output_cost
143
233
 
234
+ #########################
235
+ # Capabilities #
236
+ #########################
237
+
238
+ def supports_log_probs(self) -> Union[bool, None]:
239
+ return self.model_data.supports_log_probs
240
+
241
+ def supports_temperature(self) -> Union[bool, None]:
242
+ return self.model_data.supports_temperature
243
+
244
+ def supports_multimodal(self) -> Union[bool, None]:
245
+ return self.model_data.supports_multimodal
246
+
247
+ def supports_structured_outputs(self) -> Union[bool, None]:
248
+ return self.model_data.supports_structured_outputs
249
+
250
+ def supports_json_mode(self) -> Union[bool, None]:
251
+ return self.model_data.supports_json
252
+
144
253
  ###############################################
145
254
  # Model
146
255
  ###############################################
147
256
 
148
257
  def load_model(self, async_mode: bool = False):
149
- if not async_mode:
150
- return self._build_client(Anthropic)
151
- return self._build_client(AsyncAnthropic)
258
+ module = require_dependency(
259
+ "anthropic",
260
+ provider_label="AnthropicModel",
261
+ install_hint="Install it with `pip install anthropic`.",
262
+ )
152
263
 
153
- def get_model_name(self):
154
- return f"{self.model_name}"
264
+ if not async_mode:
265
+ return self._build_client(module.Anthropic)
266
+ return self._build_client(module.AsyncAnthropic)
155
267
 
156
268
  def _client_kwargs(self) -> Dict:
157
269
  kwargs = dict(self.kwargs or {})
@@ -163,10 +275,10 @@ class AnthropicModel(DeepEvalBaseLLM):
163
275
 
164
276
  def _build_client(self, cls):
165
277
  api_key = require_secret_api_key(
166
- self._anthropic_api_key,
278
+ self.api_key,
167
279
  provider_label="Anthropic",
168
280
  env_var_name="ANTHROPIC_API_KEY",
169
- param_hint="`_anthropic_api_key` to AnthropicModel(...)",
281
+ param_hint="`api_key` to AnthropicModel(...)",
170
282
  )
171
283
  kw = dict(
172
284
  api_key=api_key,
@@ -180,3 +292,6 @@ class AnthropicModel(DeepEvalBaseLLM):
180
292
  kw.pop("max_retries", None)
181
293
  return cls(**kw)
182
294
  raise
295
+
296
+ def get_model_name(self):
297
+ return f"{self.name} (Anthropic)"