deepeval 3.7.4__py3-none-any.whl → 3.7.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (224)
  1. deepeval/_version.py +1 -1
  2. deepeval/config/settings.py +35 -1
  3. deepeval/dataset/api.py +23 -1
  4. deepeval/dataset/golden.py +139 -2
  5. deepeval/evaluate/evaluate.py +16 -11
  6. deepeval/evaluate/execute.py +13 -181
  7. deepeval/evaluate/utils.py +6 -26
  8. deepeval/integrations/pydantic_ai/agent.py +19 -2
  9. deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
  10. deepeval/key_handler.py +3 -0
  11. deepeval/metrics/__init__.py +14 -16
  12. deepeval/metrics/answer_relevancy/answer_relevancy.py +118 -116
  13. deepeval/metrics/answer_relevancy/template.py +22 -3
  14. deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
  15. deepeval/metrics/arena_g_eval/template.py +17 -1
  16. deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
  17. deepeval/metrics/argument_correctness/template.py +19 -2
  18. deepeval/metrics/base_metric.py +13 -44
  19. deepeval/metrics/bias/bias.py +102 -108
  20. deepeval/metrics/bias/template.py +14 -2
  21. deepeval/metrics/contextual_precision/contextual_precision.py +96 -94
  22. deepeval/metrics/contextual_precision/template.py +115 -66
  23. deepeval/metrics/contextual_recall/contextual_recall.py +94 -84
  24. deepeval/metrics/contextual_recall/template.py +106 -55
  25. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +86 -84
  26. deepeval/metrics/contextual_relevancy/template.py +87 -58
  27. deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
  28. deepeval/metrics/conversation_completeness/template.py +23 -3
  29. deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
  30. deepeval/metrics/conversational_dag/nodes.py +66 -123
  31. deepeval/metrics/conversational_dag/templates.py +16 -0
  32. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
  33. deepeval/metrics/dag/dag.py +10 -0
  34. deepeval/metrics/dag/nodes.py +63 -126
  35. deepeval/metrics/dag/templates.py +16 -2
  36. deepeval/metrics/exact_match/exact_match.py +9 -1
  37. deepeval/metrics/faithfulness/faithfulness.py +138 -149
  38. deepeval/metrics/faithfulness/schema.py +1 -1
  39. deepeval/metrics/faithfulness/template.py +200 -115
  40. deepeval/metrics/g_eval/g_eval.py +87 -78
  41. deepeval/metrics/g_eval/template.py +18 -1
  42. deepeval/metrics/g_eval/utils.py +7 -6
  43. deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
  44. deepeval/metrics/goal_accuracy/template.py +21 -3
  45. deepeval/metrics/hallucination/hallucination.py +60 -75
  46. deepeval/metrics/hallucination/template.py +13 -0
  47. deepeval/metrics/indicator.py +7 -10
  48. deepeval/metrics/json_correctness/json_correctness.py +40 -38
  49. deepeval/metrics/json_correctness/template.py +10 -0
  50. deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
  51. deepeval/metrics/knowledge_retention/schema.py +9 -3
  52. deepeval/metrics/knowledge_retention/template.py +12 -0
  53. deepeval/metrics/mcp/mcp_task_completion.py +68 -38
  54. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
  55. deepeval/metrics/mcp/template.py +52 -0
  56. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
  57. deepeval/metrics/mcp_use_metric/template.py +12 -0
  58. deepeval/metrics/misuse/misuse.py +77 -97
  59. deepeval/metrics/misuse/template.py +15 -0
  60. deepeval/metrics/multimodal_metrics/__init__.py +0 -19
  61. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +59 -53
  62. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +79 -95
  63. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +59 -53
  64. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +59 -53
  65. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +111 -109
  66. deepeval/metrics/non_advice/non_advice.py +79 -105
  67. deepeval/metrics/non_advice/template.py +12 -0
  68. deepeval/metrics/pattern_match/pattern_match.py +12 -4
  69. deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
  70. deepeval/metrics/pii_leakage/template.py +14 -0
  71. deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
  72. deepeval/metrics/plan_adherence/template.py +11 -0
  73. deepeval/metrics/plan_quality/plan_quality.py +63 -87
  74. deepeval/metrics/plan_quality/template.py +9 -0
  75. deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
  76. deepeval/metrics/prompt_alignment/template.py +12 -0
  77. deepeval/metrics/ragas.py +3 -3
  78. deepeval/metrics/role_adherence/role_adherence.py +48 -71
  79. deepeval/metrics/role_adherence/template.py +14 -0
  80. deepeval/metrics/role_violation/role_violation.py +75 -108
  81. deepeval/metrics/role_violation/template.py +12 -0
  82. deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
  83. deepeval/metrics/step_efficiency/template.py +11 -0
  84. deepeval/metrics/summarization/summarization.py +115 -183
  85. deepeval/metrics/summarization/template.py +19 -0
  86. deepeval/metrics/task_completion/task_completion.py +67 -73
  87. deepeval/metrics/tool_correctness/tool_correctness.py +45 -44
  88. deepeval/metrics/tool_use/tool_use.py +42 -66
  89. deepeval/metrics/topic_adherence/template.py +13 -0
  90. deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
  91. deepeval/metrics/toxicity/template.py +13 -0
  92. deepeval/metrics/toxicity/toxicity.py +80 -99
  93. deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  94. deepeval/metrics/turn_contextual_precision/template.py +187 -0
  95. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +592 -0
  96. deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  97. deepeval/metrics/turn_contextual_recall/template.py +178 -0
  98. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +563 -0
  99. deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
  100. deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
  101. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +576 -0
  102. deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
  103. deepeval/metrics/turn_faithfulness/template.py +218 -0
  104. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +627 -0
  105. deepeval/metrics/turn_relevancy/template.py +14 -0
  106. deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
  107. deepeval/metrics/utils.py +158 -122
  108. deepeval/models/__init__.py +0 -12
  109. deepeval/models/base_model.py +49 -33
  110. deepeval/models/embedding_models/__init__.py +7 -0
  111. deepeval/models/embedding_models/azure_embedding_model.py +79 -33
  112. deepeval/models/embedding_models/local_embedding_model.py +39 -20
  113. deepeval/models/embedding_models/ollama_embedding_model.py +52 -19
  114. deepeval/models/embedding_models/openai_embedding_model.py +42 -22
  115. deepeval/models/llms/amazon_bedrock_model.py +226 -72
  116. deepeval/models/llms/anthropic_model.py +178 -63
  117. deepeval/models/llms/azure_model.py +218 -60
  118. deepeval/models/llms/constants.py +2032 -0
  119. deepeval/models/llms/deepseek_model.py +95 -40
  120. deepeval/models/llms/gemini_model.py +209 -64
  121. deepeval/models/llms/grok_model.py +139 -68
  122. deepeval/models/llms/kimi_model.py +140 -90
  123. deepeval/models/llms/litellm_model.py +131 -37
  124. deepeval/models/llms/local_model.py +125 -21
  125. deepeval/models/llms/ollama_model.py +147 -24
  126. deepeval/models/llms/openai_model.py +222 -269
  127. deepeval/models/llms/portkey_model.py +81 -22
  128. deepeval/models/llms/utils.py +8 -3
  129. deepeval/models/retry_policy.py +17 -14
  130. deepeval/models/utils.py +106 -5
  131. deepeval/optimizer/__init__.py +5 -0
  132. deepeval/optimizer/algorithms/__init__.py +6 -0
  133. deepeval/optimizer/algorithms/base.py +29 -0
  134. deepeval/optimizer/algorithms/configs.py +18 -0
  135. deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  136. deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
  137. deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  138. deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
  139. deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  140. deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  141. deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  142. deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  143. deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  144. deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
  145. deepeval/{optimization → optimizer}/configs.py +5 -8
  146. deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
  147. deepeval/optimizer/prompt_optimizer.py +263 -0
  148. deepeval/optimizer/rewriter/__init__.py +5 -0
  149. deepeval/optimizer/rewriter/rewriter.py +124 -0
  150. deepeval/optimizer/rewriter/utils.py +214 -0
  151. deepeval/optimizer/scorer/__init__.py +5 -0
  152. deepeval/optimizer/scorer/base.py +86 -0
  153. deepeval/optimizer/scorer/scorer.py +316 -0
  154. deepeval/optimizer/scorer/utils.py +30 -0
  155. deepeval/optimizer/types.py +148 -0
  156. deepeval/{optimization → optimizer}/utils.py +47 -165
  157. deepeval/prompt/prompt.py +5 -9
  158. deepeval/simulator/conversation_simulator.py +43 -0
  159. deepeval/simulator/template.py +13 -0
  160. deepeval/test_case/__init__.py +1 -3
  161. deepeval/test_case/api.py +26 -45
  162. deepeval/test_case/arena_test_case.py +7 -2
  163. deepeval/test_case/conversational_test_case.py +68 -1
  164. deepeval/test_case/llm_test_case.py +206 -1
  165. deepeval/test_case/utils.py +4 -8
  166. deepeval/test_run/api.py +18 -14
  167. deepeval/test_run/test_run.py +3 -3
  168. deepeval/tracing/patchers.py +9 -4
  169. deepeval/tracing/tracing.py +2 -2
  170. deepeval/utils.py +65 -0
  171. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -4
  172. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/RECORD +180 -193
  173. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
  174. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
  175. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
  176. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
  177. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
  178. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
  179. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
  180. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
  181. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
  182. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
  183. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
  184. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  185. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
  186. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
  187. deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
  188. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
  189. deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
  190. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -148
  191. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
  192. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  193. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
  194. deepeval/models/mlllms/__init__.py +0 -4
  195. deepeval/models/mlllms/azure_model.py +0 -343
  196. deepeval/models/mlllms/gemini_model.py +0 -313
  197. deepeval/models/mlllms/ollama_model.py +0 -175
  198. deepeval/models/mlllms/openai_model.py +0 -309
  199. deepeval/optimization/__init__.py +0 -13
  200. deepeval/optimization/adapters/__init__.py +0 -2
  201. deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
  202. deepeval/optimization/aggregates.py +0 -14
  203. deepeval/optimization/copro/configs.py +0 -31
  204. deepeval/optimization/gepa/__init__.py +0 -7
  205. deepeval/optimization/gepa/configs.py +0 -115
  206. deepeval/optimization/miprov2/configs.py +0 -134
  207. deepeval/optimization/miprov2/loop.py +0 -785
  208. deepeval/optimization/mutations/__init__.py +0 -0
  209. deepeval/optimization/mutations/prompt_rewriter.py +0 -458
  210. deepeval/optimization/policies/__init__.py +0 -16
  211. deepeval/optimization/policies/tie_breaker.py +0 -67
  212. deepeval/optimization/prompt_optimizer.py +0 -462
  213. deepeval/optimization/simba/__init__.py +0 -0
  214. deepeval/optimization/simba/configs.py +0 -33
  215. deepeval/optimization/types.py +0 -361
  216. deepeval/test_case/mllm_test_case.py +0 -170
  217. /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
  218. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
  219. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
  220. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
  221. /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
  222. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
  223. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
  224. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
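The most visible structural changes in this release: new turn-level RAG metrics (turn_contextual_precision, turn_contextual_recall, turn_contextual_relevancy, turn_faithfulness), the deepeval.optimization package renamed to deepeval.optimizer (with a new MIPROv2 bootstrapper/proposer split), and removal of the dedicated multimodal metrics, mlllms model wrappers, and mllm_test_case in favor of multimodal support on the regular LLM classes. A quick sketch of the resulting module paths, taken from the file list above — only the module files are confirmed; this diff does not show what each module exports, so the sketch imports modules rather than guessing class names:

# Module paths present in 3.7.6 per the file list above; their exports are
# not shown in this diff, so we import the modules rather than guess names.
import deepeval.metrics.turn_faithfulness.turn_faithfulness
import deepeval.metrics.turn_contextual_precision.turn_contextual_precision
import deepeval.optimizer.prompt_optimizer  # replaces deepeval.optimization.prompt_optimizer

Two of the changed files are shown in full below.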
deepeval/models/llms/deepseek_model.py

@@ -2,66 +2,101 @@ from typing import Optional, Tuple, Union, Dict
 from openai import OpenAI, AsyncOpenAI
 from pydantic import BaseModel, SecretStr
 
+from deepeval.errors import DeepEvalError
 from deepeval.config.settings import get_settings
 from deepeval.models.llms.utils import trim_and_load_json
-from deepeval.models.utils import require_secret_api_key
+from deepeval.models.utils import (
+    require_costs,
+    require_secret_api_key,
+)
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.models.retry_policy import (
     create_retry_decorator,
     sdk_retries_for,
 )
 from deepeval.constants import ProviderSlug as PS
+from deepeval.models.llms.constants import DEEPSEEK_MODELS_DATA
+from deepeval.utils import require_param
 
 
 # consistent retry rules
 retry_deepseek = create_retry_decorator(PS.DEEPSEEK)
 
-model_pricing = {
-    "deepseek-chat": {
-        "input": 0.27 / 1e6,
-        "output": 1.10 / 1e6,
-    },
-    "deepseek-reasoner": {
-        "input": 0.55 / 1e6,
-        "output": 2.19 / 1e6,
-    },
-}
-
 
 class DeepSeekModel(DeepEvalBaseLLM):
     def __init__(
         self,
-        api_key: Optional[str] = None,
         model: Optional[str] = None,
-        temperature: float = 0,
+        api_key: Optional[str] = None,
+        temperature: Optional[float] = None,
+        cost_per_input_token: Optional[float] = None,
+        cost_per_output_token: Optional[float] = None,
         generation_kwargs: Optional[Dict] = None,
         **kwargs,
    ):
         settings = get_settings()
 
-        model_name = model or settings.DEEPSEEK_MODEL_NAME
-        if model_name not in model_pricing:
-            raise ValueError(
-                f"Invalid model. Available DeepSeek models: {', '.join(model_pricing.keys())}"
-            )
-        temperature_from_key = settings.TEMPERATURE
-        if temperature_from_key is None:
-            self.temperature = temperature
+        model = model or settings.DEEPSEEK_MODEL_NAME
+
+        if temperature is not None:
+            temperature = float(temperature)
+        elif settings.TEMPERATURE is not None:
+            temperature = settings.TEMPERATURE
         else:
-            self.temperature = float(temperature_from_key)
-        if self.temperature < 0:
-            raise ValueError("Temperature must be >= 0.")
+            temperature = 0.0
+
+        cost_per_input_token = (
+            cost_per_input_token
+            if cost_per_input_token is not None
+            else settings.DEEPSEEK_COST_PER_INPUT_TOKEN
+        )
+        cost_per_output_token = (
+            cost_per_output_token
+            if cost_per_output_token is not None
+            else settings.DEEPSEEK_COST_PER_OUTPUT_TOKEN
+        )
 
         if api_key is not None:
             # keep it secret, keep it safe from serializings, logging and alike
-            self.api_key: SecretStr | None = SecretStr(api_key)
+            self.api_key: Optional[SecretStr] = SecretStr(api_key)
         else:
             self.api_key = settings.DEEPSEEK_API_KEY
 
         self.base_url = "https://api.deepseek.com"
+
+        # validation
+        model = require_param(
+            model,
+            provider_label="DeepSeekModel",
+            env_var_name="DEEPSEEK_MODEL_NAME",
+            param_hint="model",
+        )
+
+        if temperature < 0:
+            raise DeepEvalError("Temperature must be >= 0.")
+
+        self.model_data = DEEPSEEK_MODELS_DATA.get(model)
+        self.temperature = temperature
+
+        cost_per_input_token, cost_per_output_token = require_costs(
+            self.model_data,
+            model,
+            "DEEPSEEK_COST_PER_INPUT_TOKEN",
+            "DEEPSEEK_COST_PER_OUTPUT_TOKEN",
+            cost_per_input_token,
+            cost_per_output_token,
+        )
+        self.model_data.input_price = cost_per_input_token
+        self.model_data.output_price = cost_per_output_token
+
+        # Keep sanitized kwargs for client call to strip legacy keys
         self.kwargs = kwargs
-        self.generation_kwargs = generation_kwargs or {}
-        super().__init__(model_name)
+        self.kwargs.pop("temperature", None)
+
+        self.generation_kwargs = dict(generation_kwargs or {})
+        self.generation_kwargs.pop("temperature", None)
+
+        super().__init__(model)
 
     ###############################################
     # Other generate functions
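The reworked constructor resolves each setting from the explicit argument first, then from settings, and only validates after everything is resolved. A minimal construction sketch using the new keyword arguments from the hunk above — the per-token costs shown are the deepseek-chat prices from the removed model_pricing table, used purely as an illustration:

from deepeval.models.llms.deepseek_model import DeepSeekModel

# Costs are USD per single token. Explicit values take precedence over the
# DEEPSEEK_COST_PER_INPUT_TOKEN / DEEPSEEK_COST_PER_OUTPUT_TOKEN settings,
# and require_costs() resolves them against DEEPSEEK_MODELS_DATA.
model = DeepSeekModel(
    model="deepseek-chat",
    api_key="sk-...",                 # placeholder; falls back to DEEPSEEK_API_KEY
    temperature=0.0,                  # falls back to TEMPERATURE, then 0.0
    cost_per_input_token=0.27 / 1e6,  # illustrative, from the old pricing table
    cost_per_output_token=1.10 / 1e6,
)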
@@ -70,11 +105,12 @@ class DeepSeekModel(DeepEvalBaseLLM):
     @retry_deepseek
     def generate(
         self, prompt: str, schema: Optional[BaseModel] = None
-    ) -> Tuple[Union[str, Dict], float]:
+    ) -> Tuple[Union[str, BaseModel], float]:
+
         client = self.load_model(async_mode=False)
         if schema:
             completion = client.chat.completions.create(
-                model=self.model_name,
+                model=self.name,
                 messages=[{"role": "user", "content": prompt}],
                 response_format={"type": "json_object"},
                 temperature=self.temperature,
@@ -90,7 +126,7 @@ class DeepSeekModel(DeepEvalBaseLLM):
             return schema.model_validate(json_output), cost
         else:
             completion = client.chat.completions.create(
-                model=self.model_name,
+                model=self.name,
                 messages=[{"role": "user", "content": prompt}],
                 **self.generation_kwargs,
             )
@@ -104,11 +140,12 @@ class DeepSeekModel(DeepEvalBaseLLM):
     @retry_deepseek
     async def a_generate(
         self, prompt: str, schema: Optional[BaseModel] = None
-    ) -> Tuple[Union[str, Dict], float]:
+    ) -> Tuple[Union[str, BaseModel], float]:
+
         client = self.load_model(async_mode=True)
         if schema:
             completion = await client.chat.completions.create(
-                model=self.model_name,
+                model=self.name,
                 messages=[{"role": "user", "content": prompt}],
                 response_format={"type": "json_object"},
                 temperature=self.temperature,
@@ -124,7 +161,7 @@ class DeepSeekModel(DeepEvalBaseLLM):
             return schema.model_validate(json_output), cost
         else:
             completion = await client.chat.completions.create(
-                model=self.model_name,
+                model=self.name,
                 messages=[{"role": "user", "content": prompt}],
                 **self.generation_kwargs,
             )
@@ -144,11 +181,29 @@ class DeepSeekModel(DeepEvalBaseLLM):
         input_tokens: int,
         output_tokens: int,
     ) -> float:
-        pricing = model_pricing.get(self.model_name, model_pricing)
-        input_cost = input_tokens * pricing["input"]
-        output_cost = output_tokens * pricing["output"]
+        input_cost = input_tokens * self.model_data.input_price
+        output_cost = output_tokens * self.model_data.output_price
         return input_cost + output_cost
 
+    ###############################################
+    # Capabilities
+    ###############################################
+
+    def supports_log_probs(self) -> Union[bool, None]:
+        return self.model_data.supports_log_probs
+
+    def supports_temperature(self) -> Union[bool, None]:
+        return self.model_data.supports_temperature
+
+    def supports_multimodal(self) -> Union[bool, None]:
+        return self.model_data.supports_multimodal
+
+    def supports_structured_outputs(self) -> Union[bool, None]:
+        return self.model_data.supports_structured_outputs
+
+    def supports_json_mode(self) -> Union[bool, None]:
+        return self.model_data.supports_json
+
     ###############################################
     # Model
     ###############################################
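calculate_cost is now a straight multiply against the prices stored on model_data instead of the module-level model_pricing dict. A worked example of the same arithmetic, using the old deepseek-chat prices ($0.27 and $1.10 per million input/output tokens) from the removed table:

# Same arithmetic as calculate_cost above; prices are the deepseek-chat
# values from the removed model_pricing table (USD per token).
input_price = 0.27 / 1e6
output_price = 1.10 / 1e6
input_tokens, output_tokens = 10_000, 2_000

cost = input_tokens * input_price + output_tokens * output_price
print(round(cost, 4))  # 0.0049 -> $0.0027 input + $0.0022 output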
@@ -158,9 +213,6 @@ class DeepSeekModel(DeepEvalBaseLLM):
             return self._build_client(OpenAI)
         return self._build_client(AsyncOpenAI)
 
-    def get_model_name(self):
-        return f"{self.model_name}"
-
     def _client_kwargs(self) -> Dict:
         kwargs = dict(self.kwargs or {})
         # if we are managing retries with Tenacity, force SDK retries off to avoid double retries.
@@ -190,3 +242,6 @@ class DeepSeekModel(DeepEvalBaseLLM):
                 kw.pop("max_retries", None)
                 return cls(**kw)
             raise
+
+    def get_model_name(self):
+        return f"{self.name} (Deepseek)"
deepeval/models/llms/gemini_model.py

@@ -1,17 +1,26 @@
 import json
-
+import base64
 from pydantic import BaseModel, SecretStr
-from google.genai import types, Client
-from typing import Optional, Dict
+from typing import TYPE_CHECKING, Optional, Dict, List, Union, Tuple
 
+from deepeval.errors import DeepEvalError
+from deepeval.test_case import MLLMImage
 from deepeval.config.settings import get_settings
 from deepeval.models.utils import require_secret_api_key
 from deepeval.models.retry_policy import (
     create_retry_decorator,
 )
+from deepeval.utils import (
+    convert_to_multi_modal_array,
+    check_if_multimodal,
+    require_dependency,
+)
 from deepeval.models.base_model import DeepEvalBaseLLM
 from deepeval.constants import ProviderSlug as PS
-from google.oauth2 import service_account
+from deepeval.models.llms.constants import GEMINI_MODELS_DATA
+
+if TYPE_CHECKING:
+    from google.genai import Client
 
 default_gemini_model = "gemini-1.5-pro"
 
@@ -28,7 +37,7 @@ class GeminiModel(DeepEvalBaseLLM):
     To use Vertex AI API, set project and location attributes.
 
     Attributes:
-        model_name: Name of the Gemini model to use
+        model: Name of the Gemini model to use
         api_key: Google API key for authentication
         project: Google Cloud project ID
         location: Google Cloud location
@@ -39,7 +48,7 @@ class GeminiModel(DeepEvalBaseLLM):
 
         # Initialize the model
         model = GeminiModel(
-            model_name="gemini-1.5-pro-001",
+            model="gemini-1.5-pro-001",
             api_key="your-api-key"
         )
 
@@ -50,75 +59,89 @@
 
     def __init__(
         self,
-        model_name: Optional[str] = None,
+        model: Optional[str] = None,
         api_key: Optional[str] = None,
+        temperature: Optional[float] = None,
         project: Optional[str] = None,
         location: Optional[str] = None,
-        service_account_key: Optional[Dict[str, str]] = None,
-        temperature: float = 0,
+        service_account_key: Optional[Union[str, Dict[str, str]]] = None,
         generation_kwargs: Optional[Dict] = None,
         **kwargs,
     ):
 
         settings = get_settings()
 
-        model_name = (
-            model_name or settings.GEMINI_MODEL_NAME or default_gemini_model
-        )
+        model = model or settings.GEMINI_MODEL_NAME or default_gemini_model
+        self.model_data = GEMINI_MODELS_DATA.get(model)
 
         # Get API key from settings if not provided
         if api_key is not None:
             # keep it secret, keep it safe from serializings, logging and aolike
-            self.api_key: SecretStr | None = SecretStr(api_key)
+            self.api_key: Optional[SecretStr] = SecretStr(api_key)
         else:
             self.api_key = settings.GOOGLE_API_KEY
 
+        if temperature is not None:
+            temperature = float(temperature)
+        elif settings.TEMPERATURE is not None:
+            temperature = settings.TEMPERATURE
+        else:
+            temperature = 0.0
+
         self.project = project or settings.GOOGLE_CLOUD_PROJECT
-        self.location = (
-            location
-            or settings.GOOGLE_CLOUD_LOCATION is not None
-            and str(settings.GOOGLE_CLOUD_LOCATION)
+        location = (
+            location if location is not None else settings.GOOGLE_CLOUD_LOCATION
         )
+        self.location = str(location).strip() if location is not None else None
         self.use_vertexai = settings.GOOGLE_GENAI_USE_VERTEXAI
 
-        if service_account_key:
-            self.service_account_key = service_account_key
+        self.service_account_key: Optional[SecretStr] = None
+        if service_account_key is None:
+            self.service_account_key = settings.GOOGLE_SERVICE_ACCOUNT_KEY
+        elif isinstance(service_account_key, dict):
+            self.service_account_key = SecretStr(
+                json.dumps(service_account_key)
+            )
         else:
-            service_account_key_data = settings.GOOGLE_SERVICE_ACCOUNT_KEY
-            if service_account_key_data is None:
-                self.service_account_key = None
-            elif isinstance(service_account_key_data, str):
-                self.service_account_key = json.loads(service_account_key_data)
+            str_value = str(service_account_key).strip()
+            self.service_account_key = (
+                SecretStr(str_value) if str_value else None
+            )
 
         if temperature < 0:
-            raise ValueError("Temperature must be >= 0.")
+            raise DeepEvalError("Temperature must be >= 0.")
+
         self.temperature = temperature
 
         # Raw kwargs destined for the underlying Client
         self.kwargs = kwargs
-        self.generation_kwargs = generation_kwargs or {}
+        self.kwargs.pop("temperature", None)
 
+        self.generation_kwargs = dict(generation_kwargs or {})
+        self.generation_kwargs.pop("temperature", None)
+
+        self._module = self._require_module()
         # Configure default model generation settings
         self.model_safety_settings = [
-            types.SafetySetting(
-                category=types.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
-                threshold=types.HarmBlockThreshold.BLOCK_NONE,
+            self._module.types.SafetySetting(
+                category=self._module.types.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
+                threshold=self._module.types.HarmBlockThreshold.BLOCK_NONE,
             ),
-            types.SafetySetting(
-                category=types.HarmCategory.HARM_CATEGORY_HARASSMENT,
-                threshold=types.HarmBlockThreshold.BLOCK_NONE,
+            self._module.types.SafetySetting(
+                category=self._module.types.HarmCategory.HARM_CATEGORY_HARASSMENT,
+                threshold=self._module.types.HarmBlockThreshold.BLOCK_NONE,
            ),
-            types.SafetySetting(
-                category=types.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
-                threshold=types.HarmBlockThreshold.BLOCK_NONE,
+            self._module.types.SafetySetting(
+                category=self._module.types.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
+                threshold=self._module.types.HarmBlockThreshold.BLOCK_NONE,
             ),
-            types.SafetySetting(
-                category=types.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
-                threshold=types.HarmBlockThreshold.BLOCK_NONE,
+            self._module.types.SafetySetting(
+                category=self._module.types.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
+                threshold=self._module.types.HarmBlockThreshold.BLOCK_NONE,
             ),
         ]
 
-        super().__init__(model_name, **kwargs)
+        super().__init__(model)
 
     def should_use_vertexai(self) -> bool:
         """Checks if the model should use Vertex AI for generation.
@@ -131,18 +154,73 @@ class GeminiModel(DeepEvalBaseLLM):
             True if the model should use Vertex AI, False otherwise
         """
         if self.use_vertexai is not None:
-            return self.use_vertexai.lower() == "yes"
+            return self.use_vertexai
         if self.project and self.location:
             return True
         else:
             return False
 
+    @retry_gemini
+    def generate_content(
+        self, multimodal_input: Optional[List[Union[str, MLLMImage]]] = None
+    ):
+        multimodal_input = (
+            multimodal_input if multimodal_input is not None else []
+        )
+        content = []
+
+        for element in multimodal_input:
+            if isinstance(element, str):
+                content.append(element)
+            elif isinstance(element, MLLMImage):
+                # Gemini doesn't support direct external URLs
+                # Must convert all images to bytes
+                if element.url and not element.local:
+                    import requests
+
+                    settings = get_settings()
+
+                    response = requests.get(
+                        element.url,
+                        timeout=(
+                            settings.MEDIA_IMAGE_CONNECT_TIMEOUT_SECONDS,
+                            settings.MEDIA_IMAGE_READ_TIMEOUT_SECONDS,
+                        ),
+                    )
+                    response.raise_for_status()
+                    image_data = response.content
+                    mime_type = response.headers.get(
+                        "content-type", element.mimeType or "image/jpeg"
+                    )
+                else:
+                    element.ensure_images_loaded()
+                    try:
+                        image_data = base64.b64decode(element.dataBase64)
+                    except Exception:
+                        raise ValueError(
+                            f"Invalid base64 data in MLLMImage: {element._id}"
+                        )
+
+                    mime_type = element.mimeType or "image/jpeg"
+
+                # Create Part from bytes
+                image_part = self._module.types.Part.from_bytes(
+                    data=image_data, mime_type=mime_type
+                )
+                content.append(image_part)
+            else:
+                raise DeepEvalError(f"Invalid input type: {type(element)}")
+
+        return content
+
     ###############################################
     # Generate functions
     ###############################################
 
     @retry_gemini
-    def generate(self, prompt: str, schema: Optional[BaseModel] = None) -> str:
+    def generate(
+        self, prompt: str, schema: Optional[BaseModel] = None
+    ) -> Tuple[Union[str, BaseModel], float]:
         """Generates text from a prompt.
 
         Args:
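The new generate_content helper turns a mixed list of strings and MLLMImage objects into Gen AI parts: remote URLs are downloaded (Gemini does not fetch external URLs itself) and local images are base64-decoded, then wrapped with types.Part.from_bytes. A hedged usage sketch — the MLLMImage(url=...) constructor call is assumed from the element.url / element.local attributes read above:

from deepeval.test_case import MLLMImage

model = GeminiModel(api_key="your-api-key")  # placeholder key

# Mixed text + image input; generate() routes through this same path when
# check_if_multimodal(prompt) is true.
parts = model.generate_content([
    "Describe what is shown in this image.",
    MLLMImage(url="https://example.com/photo.jpg"),  # assumed constructor
])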
@@ -154,11 +232,15 @@ class GeminiModel(DeepEvalBaseLLM):
         """
         client = self.load_model()
 
+        if check_if_multimodal(prompt):
+            prompt = convert_to_multi_modal_array(prompt)
+            prompt = self.generate_content(prompt)
+
         if schema is not None:
             response = client.models.generate_content(
-                model=self.model_name,
+                model=self.name,
                 contents=prompt,
-                config=types.GenerateContentConfig(
+                config=self._module.types.GenerateContentConfig(
                     response_mime_type="application/json",
                     response_schema=schema,
                     safety_settings=self.model_safety_settings,
@@ -169,9 +251,9 @@ class GeminiModel(DeepEvalBaseLLM):
             return response.parsed, 0
         else:
             response = client.models.generate_content(
-                model=self.model_name,
+                model=self.name,
                 contents=prompt,
-                config=types.GenerateContentConfig(
+                config=self._module.types.GenerateContentConfig(
                     safety_settings=self.model_safety_settings,
                     temperature=self.temperature,
                     **self.generation_kwargs,
@@ -182,7 +264,7 @@ class GeminiModel(DeepEvalBaseLLM):
     @retry_gemini
     async def a_generate(
         self, prompt: str, schema: Optional[BaseModel] = None
-    ) -> str:
+    ) -> Tuple[Union[str, BaseModel], float]:
         """Asynchronously generates text from a prompt.
 
         Args:
@@ -194,11 +276,15 @@ class GeminiModel(DeepEvalBaseLLM):
         """
         client = self.load_model()
 
+        if check_if_multimodal(prompt):
+            prompt = convert_to_multi_modal_array(prompt)
+            prompt = self.generate_content(prompt)
+
         if schema is not None:
             response = await client.aio.models.generate_content(
-                model=self.model_name,
+                model=self.name,
                 contents=prompt,
-                config=types.GenerateContentConfig(
+                config=self._module.types.GenerateContentConfig(
                     response_mime_type="application/json",
                     response_schema=schema,
                     safety_settings=self.model_safety_settings,
@@ -209,9 +295,9 @@ class GeminiModel(DeepEvalBaseLLM):
             return response.parsed, 0
         else:
             response = await client.aio.models.generate_content(
-                model=self.model_name,
+                model=self.name,
                 contents=prompt,
-                config=types.GenerateContentConfig(
+                config=self._module.types.GenerateContentConfig(
                     safety_settings=self.model_safety_settings,
                     temperature=self.temperature,
                     **self.generation_kwargs,
@@ -219,15 +305,37 @@ class GeminiModel(DeepEvalBaseLLM):
                 ),
             )
             return response.text, 0
 
+    #########################
+    # Capabilities          #
+    #########################
+
+    def supports_log_probs(self) -> Union[bool, None]:
+        return self.model_data.supports_log_probs
+
+    def supports_temperature(self) -> Union[bool, None]:
+        return self.model_data.supports_temperature
+
+    def supports_multimodal(self) -> Union[bool, None]:
+        return self.model_data.supports_multimodal
+
+    def supports_structured_outputs(self) -> Union[bool, None]:
+        """
+        OpenAI models that natively enforce typed structured outputs.
+        Used by generate(...) when a schema is provided.
+        """
+        return self.model_data.supports_structured_outputs
+
+    def supports_json_mode(self) -> Union[bool, None]:
+        """
+        OpenAI models that enforce JSON mode
+        """
+        return self.model_data.supports_json
+
     #########
     # Model #
     #########
 
-    def get_model_name(self) -> str:
-        """Returns the name of the Gemini model being used."""
-        return self.model_name
-
-    def load_model(self, *args, **kwargs):
+    def load_model(self):
         """Creates a client.
         With Gen AI SDK, model is set at inference time, so there is no
         model to load and initialize.
@@ -236,7 +344,21 @@ class GeminiModel(DeepEvalBaseLLM):
         Returns:
             A GenerativeModel instance configured for evaluation.
         """
-        return self._build_client(**kwargs)
+        return self._build_client()
+
+    def _require_oauth2(self):
+        return require_dependency(
+            "google.oauth2",
+            provider_label="GeminiModel",
+            install_hint="Install it with `pip install google-auth`.",
+        )
+
+    def _require_module(self):
+        return require_dependency(
+            "google.genai",
+            provider_label="GeminiModel",
+            install_hint="Install it with `pip install google-genai`.",
+        )
 
     def _client_kwargs(self, **override_kwargs) -> Dict:
         """Merge ctor kwargs with any overrides passed at load_model time."""
@@ -245,29 +367,49 @@ class GeminiModel(DeepEvalBaseLLM):
         client_kwargs.update(override_kwargs)
         return client_kwargs
 
-    def _build_client(self, **override_kwargs) -> Client:
-        client_kwargs = self._client_kwargs(**override_kwargs)
+    def _build_client(self) -> "Client":
+        client_kwargs = self._client_kwargs(**self.kwargs)
 
         if self.should_use_vertexai():
+            service_account_key_json = require_secret_api_key(
+                self.service_account_key,
+                provider_label="Google Gemini",
+                env_var_name="GOOGLE_SERVICE_ACCOUNT_KEY",
+                param_hint="`service_account_key` to GeminiModel(...)",
+            )
+
+            try:
+                service_account_key = json.loads(service_account_key_json)
+            except Exception as e:
+                raise DeepEvalError(
+                    "GOOGLE_SERVICE_ACCOUNT_KEY must be valid JSON for a Google service account."
+                ) from e
+
+            if not isinstance(service_account_key, dict):
+                raise DeepEvalError(
+                    "GOOGLE_SERVICE_ACCOUNT_KEY must decode to a JSON object."
+                )
+
             if not self.project or not self.location:
-                raise ValueError(
+                raise DeepEvalError(
                     "When using Vertex AI API, both project and location are required. "
                     "Either provide them as arguments or set GOOGLE_CLOUD_PROJECT and "
                     "GOOGLE_CLOUD_LOCATION in your DeepEval configuration."
                 )
 
+            oauth2 = self._require_oauth2()
             credentials = (
-                service_account.Credentials.from_service_account_info(
-                    self.service_account_key,
+                oauth2.service_account.Credentials.from_service_account_info(
+                    service_account_key,
                     scopes=[
                         "https://www.googleapis.com/auth/cloud-platform",
                     ],
                )
-                if self.service_account_key
+                if service_account_key
                 else None
            )
 
-            client = Client(
+            client = self._module.Client(
                 vertexai=True,
                 project=self.project,
                 location=self.location,
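_build_client now refuses to build Vertex AI credentials unless the resolved GOOGLE_SERVICE_ACCOUNT_KEY parses as a JSON object. A sketch of a key that passes both checks — the fields shown are the standard Google service-account fields, with placeholder values:

import json

# Must json.loads(...) to a dict, or _build_client raises DeepEvalError.
service_account_key = json.dumps({
    "type": "service_account",
    "project_id": "my-gcp-project",
    "private_key": "-----BEGIN PRIVATE KEY-----\n...\n-----END PRIVATE KEY-----\n",
    "client_email": "evaluator@my-gcp-project.iam.gserviceaccount.com",
})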
@@ -282,6 +424,9 @@ class GeminiModel(DeepEvalBaseLLM):
                 param_hint="`api_key` to GeminiModel(...)",
             )
 
-            client = Client(api_key=api_key, **client_kwargs)
+            client = self._module.Client(api_key=api_key, **client_kwargs)
 
         return client
+
+    def get_model_name(self):
+        return f"{self.name} (Gemini)"