deepeval 3.7.3__py3-none-any.whl → 3.7.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (156)
  1. deepeval/_version.py +1 -1
  2. deepeval/cli/test.py +1 -1
  3. deepeval/config/settings.py +102 -13
  4. deepeval/dataset/golden.py +54 -2
  5. deepeval/evaluate/configs.py +1 -1
  6. deepeval/evaluate/evaluate.py +16 -8
  7. deepeval/evaluate/execute.py +74 -27
  8. deepeval/evaluate/utils.py +26 -22
  9. deepeval/integrations/pydantic_ai/agent.py +19 -2
  10. deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
  11. deepeval/metrics/__init__.py +14 -12
  12. deepeval/metrics/answer_relevancy/answer_relevancy.py +74 -29
  13. deepeval/metrics/answer_relevancy/template.py +188 -92
  14. deepeval/metrics/argument_correctness/template.py +2 -2
  15. deepeval/metrics/base_metric.py +2 -5
  16. deepeval/metrics/bias/template.py +3 -3
  17. deepeval/metrics/contextual_precision/contextual_precision.py +53 -15
  18. deepeval/metrics/contextual_precision/template.py +115 -66
  19. deepeval/metrics/contextual_recall/contextual_recall.py +50 -13
  20. deepeval/metrics/contextual_recall/template.py +106 -55
  21. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +47 -15
  22. deepeval/metrics/contextual_relevancy/template.py +87 -58
  23. deepeval/metrics/conversation_completeness/template.py +2 -2
  24. deepeval/metrics/conversational_dag/templates.py +4 -4
  25. deepeval/metrics/conversational_g_eval/template.py +4 -3
  26. deepeval/metrics/dag/templates.py +5 -5
  27. deepeval/metrics/faithfulness/faithfulness.py +70 -27
  28. deepeval/metrics/faithfulness/schema.py +1 -1
  29. deepeval/metrics/faithfulness/template.py +200 -115
  30. deepeval/metrics/g_eval/utils.py +2 -2
  31. deepeval/metrics/hallucination/template.py +4 -4
  32. deepeval/metrics/indicator.py +4 -4
  33. deepeval/metrics/misuse/template.py +2 -2
  34. deepeval/metrics/multimodal_metrics/__init__.py +0 -18
  35. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +24 -17
  36. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +26 -21
  37. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +24 -17
  38. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +24 -17
  39. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +19 -19
  40. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +63 -78
  41. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +20 -20
  42. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +71 -50
  43. deepeval/metrics/non_advice/template.py +2 -2
  44. deepeval/metrics/pii_leakage/template.py +2 -2
  45. deepeval/metrics/prompt_alignment/template.py +4 -4
  46. deepeval/metrics/ragas.py +3 -3
  47. deepeval/metrics/role_violation/template.py +2 -2
  48. deepeval/metrics/step_efficiency/step_efficiency.py +1 -1
  49. deepeval/metrics/tool_correctness/tool_correctness.py +2 -2
  50. deepeval/metrics/toxicity/template.py +4 -4
  51. deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  52. deepeval/metrics/turn_contextual_precision/template.py +187 -0
  53. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +550 -0
  54. deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  55. deepeval/metrics/turn_contextual_recall/template.py +178 -0
  56. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +520 -0
  57. deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
  58. deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
  59. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +535 -0
  60. deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
  61. deepeval/metrics/turn_faithfulness/template.py +218 -0
  62. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +596 -0
  63. deepeval/metrics/turn_relevancy/template.py +2 -2
  64. deepeval/metrics/utils.py +39 -58
  65. deepeval/models/__init__.py +0 -12
  66. deepeval/models/base_model.py +16 -38
  67. deepeval/models/embedding_models/__init__.py +7 -0
  68. deepeval/models/embedding_models/azure_embedding_model.py +69 -32
  69. deepeval/models/embedding_models/local_embedding_model.py +39 -22
  70. deepeval/models/embedding_models/ollama_embedding_model.py +42 -18
  71. deepeval/models/embedding_models/openai_embedding_model.py +50 -15
  72. deepeval/models/llms/amazon_bedrock_model.py +1 -2
  73. deepeval/models/llms/anthropic_model.py +53 -20
  74. deepeval/models/llms/azure_model.py +140 -43
  75. deepeval/models/llms/deepseek_model.py +38 -23
  76. deepeval/models/llms/gemini_model.py +222 -103
  77. deepeval/models/llms/grok_model.py +39 -27
  78. deepeval/models/llms/kimi_model.py +39 -23
  79. deepeval/models/llms/litellm_model.py +103 -45
  80. deepeval/models/llms/local_model.py +35 -22
  81. deepeval/models/llms/ollama_model.py +129 -17
  82. deepeval/models/llms/openai_model.py +151 -50
  83. deepeval/models/llms/portkey_model.py +149 -0
  84. deepeval/models/llms/utils.py +5 -3
  85. deepeval/models/retry_policy.py +17 -14
  86. deepeval/models/utils.py +94 -4
  87. deepeval/optimizer/__init__.py +5 -0
  88. deepeval/optimizer/algorithms/__init__.py +6 -0
  89. deepeval/optimizer/algorithms/base.py +29 -0
  90. deepeval/optimizer/algorithms/configs.py +18 -0
  91. deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  92. deepeval/optimizer/algorithms/copro/copro.py +836 -0
  93. deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  94. deepeval/optimizer/algorithms/gepa/gepa.py +737 -0
  95. deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  96. deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  97. deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  98. deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  99. deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  100. deepeval/optimizer/algorithms/simba/simba.py +999 -0
  101. deepeval/optimizer/algorithms/simba/types.py +15 -0
  102. deepeval/optimizer/configs.py +31 -0
  103. deepeval/optimizer/policies.py +227 -0
  104. deepeval/optimizer/prompt_optimizer.py +263 -0
  105. deepeval/optimizer/rewriter/__init__.py +5 -0
  106. deepeval/optimizer/rewriter/rewriter.py +124 -0
  107. deepeval/optimizer/rewriter/utils.py +214 -0
  108. deepeval/optimizer/scorer/__init__.py +5 -0
  109. deepeval/optimizer/scorer/base.py +86 -0
  110. deepeval/optimizer/scorer/scorer.py +316 -0
  111. deepeval/optimizer/scorer/utils.py +30 -0
  112. deepeval/optimizer/types.py +148 -0
  113. deepeval/optimizer/utils.py +480 -0
  114. deepeval/prompt/prompt.py +7 -6
  115. deepeval/test_case/__init__.py +1 -3
  116. deepeval/test_case/api.py +12 -10
  117. deepeval/test_case/conversational_test_case.py +19 -1
  118. deepeval/test_case/llm_test_case.py +152 -1
  119. deepeval/test_case/utils.py +4 -8
  120. deepeval/test_run/api.py +15 -14
  121. deepeval/test_run/cache.py +2 -0
  122. deepeval/test_run/test_run.py +9 -4
  123. deepeval/tracing/patchers.py +9 -4
  124. deepeval/tracing/tracing.py +2 -2
  125. deepeval/utils.py +89 -0
  126. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/METADATA +1 -4
  127. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/RECORD +134 -118
  128. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
  129. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
  130. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
  131. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
  132. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
  133. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
  134. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
  135. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
  136. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
  137. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
  138. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
  139. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  140. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
  141. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
  142. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  143. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
  144. deepeval/models/mlllms/__init__.py +0 -4
  145. deepeval/models/mlllms/azure_model.py +0 -334
  146. deepeval/models/mlllms/gemini_model.py +0 -284
  147. deepeval/models/mlllms/ollama_model.py +0 -144
  148. deepeval/models/mlllms/openai_model.py +0 -258
  149. deepeval/test_case/mllm_test_case.py +0 -170
  150. /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
  151. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
  152. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
  153. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
  154. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/LICENSE.md +0 -0
  155. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/WHEEL +0 -0
  156. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/entry_points.txt +0 -0
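
Of the 156 files, the two diffs reproduced below (deepeval/models/llms/gemini_model.py and deepeval/models/llms/grok_model.py) show a pattern that repeats across most model files in this release: configuration reads move from KEY_FILE_HANDLER.fetch_data(ModelKeyValues.X) to attribute access on a get_settings() object, and raw API-key strings are wrapped in pydantic's SecretStr so they stay masked in logs and serialized output. A minimal sketch of that settings pattern, assuming a pydantic-settings-based class (the field names come from the diffs; the class shape is illustrative, not deepeval's actual implementation):

from typing import Optional

from pydantic import SecretStr
from pydantic_settings import BaseSettings


class Settings(BaseSettings):
    # Field names as they appear in the diffs; defaults are assumptions.
    GEMINI_MODEL_NAME: Optional[str] = None
    GOOGLE_API_KEY: Optional[SecretStr] = None  # masked in repr/logs
    GROK_API_KEY: Optional[SecretStr] = None
    TEMPERATURE: Optional[float] = None


_settings = Settings()


def get_settings() -> Settings:
    # Single shared instance, mirroring how the diffs call get_settings()
    return _settings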
deepeval/models/llms/gemini_model.py

@@ -1,15 +1,32 @@
- from pydantic import BaseModel
- from google.genai import types, Client
- from typing import Optional, Dict
+ import json
+ import requests
+ from pydantic import BaseModel, SecretStr
+ from typing import TYPE_CHECKING, Optional, Dict, List, Union

+ from deepeval.test_case import MLLMImage
+ from deepeval.config.settings import get_settings
+ from deepeval.models.utils import require_secret_api_key
  from deepeval.models.retry_policy import (
      create_retry_decorator,
  )
- from deepeval.key_handler import ModelKeyValues, KEY_FILE_HANDLER
+ from deepeval.utils import (
+     convert_to_multi_modal_array,
+     check_if_multimodal,
+     require_dependency,
+ )
  from deepeval.models.base_model import DeepEvalBaseLLM
  from deepeval.constants import ProviderSlug as PS
- from google.oauth2 import service_account
- import json
+
+ valid_multimodal_models = [
+     "gemini-2.5-pro",
+     "gemini-2.5-flash",
+     "gemini-1.5-pro",
+     "gemini-1.5-flash",
+     # TODO: Add more models later
+ ]
+
+ if TYPE_CHECKING:
+     from google.genai import Client

  default_gemini_model = "gemini-1.5-pro"

@@ -26,7 +43,7 @@ class GeminiModel(DeepEvalBaseLLM):
      To use Vertex AI API, set project and location attributes.

      Attributes:
-         model_name: Name of the Gemini model to use
+         model: Name of the Gemini model to use
          api_key: Google API key for authentication
          project: Google Cloud project ID
          location: Google Cloud location
@@ -37,7 +54,7 @@ class GeminiModel(DeepEvalBaseLLM):

      # Initialize the model
      model = GeminiModel(
-         model_name="gemini-1.5-pro-001",
+         model="gemini-1.5-pro-001",
          api_key="your-api-key"
      )

@@ -48,40 +65,39 @@ class GeminiModel(DeepEvalBaseLLM):

      def __init__(
          self,
-         model_name: Optional[str] = None,
+         model: Optional[str] = None,
          api_key: Optional[str] = None,
+         temperature: float = 0,
          project: Optional[str] = None,
          location: Optional[str] = None,
          service_account_key: Optional[Dict[str, str]] = None,
-         temperature: float = 0,
          generation_kwargs: Optional[Dict] = None,
          **kwargs,
      ):
-         model_name = (
-             model_name
-             or KEY_FILE_HANDLER.fetch_data(ModelKeyValues.GEMINI_MODEL_NAME)
-             or default_gemini_model
-         )

-         # Get API key from key handler if not provided
-         self.api_key = api_key or KEY_FILE_HANDLER.fetch_data(
-             ModelKeyValues.GOOGLE_API_KEY
-         )
-         self.project = project or KEY_FILE_HANDLER.fetch_data(
-             ModelKeyValues.GOOGLE_CLOUD_PROJECT
-         )
-         self.location = location or KEY_FILE_HANDLER.fetch_data(
-             ModelKeyValues.GOOGLE_CLOUD_LOCATION
-         )
-         self.use_vertexai = KEY_FILE_HANDLER.fetch_data(
-             ModelKeyValues.GOOGLE_GENAI_USE_VERTEXAI
+         settings = get_settings()
+
+         model = model or settings.GEMINI_MODEL_NAME or default_gemini_model
+
+         # Get API key from settings if not provided
+         if api_key is not None:
+             # keep it secret, keep it safe from serializings, logging and alike
+             self.api_key: SecretStr | None = SecretStr(api_key)
+         else:
+             self.api_key = settings.GOOGLE_API_KEY
+
+         self.project = project or settings.GOOGLE_CLOUD_PROJECT
+         self.location = (
+             location
+             or settings.GOOGLE_CLOUD_LOCATION is not None
+             and str(settings.GOOGLE_CLOUD_LOCATION)
          )
+         self.use_vertexai = settings.GOOGLE_GENAI_USE_VERTEXAI
+
          if service_account_key:
              self.service_account_key = service_account_key
          else:
-             service_account_key_data = KEY_FILE_HANDLER.fetch_data(
-                 ModelKeyValues.GOOGLE_SERVICE_ACCOUNT_KEY
-             )
+             service_account_key_data = settings.GOOGLE_SERVICE_ACCOUNT_KEY
              if service_account_key_data is None:
                  self.service_account_key = None
              elif isinstance(service_account_key_data, str):
@@ -90,11 +106,35 @@ class GeminiModel(DeepEvalBaseLLM):
          if temperature < 0:
              raise ValueError("Temperature must be >= 0.")
          self.temperature = temperature
+
+         # Raw kwargs destined for the underlying Client
          self.kwargs = kwargs
          self.generation_kwargs = generation_kwargs or {}
-         super().__init__(model_name, **kwargs)

-     def should_use_vertexai(self):
+         self._module = self._require_module()
+         # Configure default model generation settings
+         self.model_safety_settings = [
+             self._module.types.SafetySetting(
+                 category=self._module.types.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
+                 threshold=self._module.types.HarmBlockThreshold.BLOCK_NONE,
+             ),
+             self._module.types.SafetySetting(
+                 category=self._module.types.HarmCategory.HARM_CATEGORY_HARASSMENT,
+                 threshold=self._module.types.HarmBlockThreshold.BLOCK_NONE,
+             ),
+             self._module.types.SafetySetting(
+                 category=self._module.types.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
+                 threshold=self._module.types.HarmBlockThreshold.BLOCK_NONE,
+             ),
+             self._module.types.SafetySetting(
+                 category=self._module.types.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
+                 threshold=self._module.types.HarmBlockThreshold.BLOCK_NONE,
+             ),
+         ]
+
+         super().__init__(model)
+
+     def should_use_vertexai(self) -> bool:
          """Checks if the model should use Vertex AI for generation.

          This is determined first by the value of `GOOGLE_GENAI_USE_VERTEXAI`
@@ -111,69 +151,53 @@ class GeminiModel(DeepEvalBaseLLM):
          else:
              return False

-     def load_model(self, *args, **kwargs):
-         """Creates a client.
-         With Gen AI SDK, model is set at inference time, so there is no
-         model to load and initialize.
-         This method name is kept for compatibility with other LLMs.
+     @retry_gemini
+     def generate_prompt(
+         self, multimodal_input: List[Union[str, MLLMImage]] = []
+     ) -> List[Union[str, MLLMImage]]:
+         """Converts DeepEval multimodal input into GenAI SDK compatible format.
+
+         Args:
+             multimodal_input: List of strings and MLLMImage objects

          Returns:
-             A GenerativeModel instance configured for evaluation.
+             List of strings and PIL Image objects ready for model input
+
+         Raises:
+             ValueError: If an invalid input type is provided
          """
-         if self.should_use_vertexai():
-             if not self.project or not self.location:
-                 raise ValueError(
-                     "When using Vertex AI API, both project and location are required."
-                     "Either provide them as arguments or set GOOGLE_CLOUD_PROJECT and GOOGLE_CLOUD_LOCATION environment variables, "
-                     "or set them in your DeepEval configuration."
-                 )
+         prompt = []
+         settings = get_settings()

-             # Create client for Vertex AI
-             self.client = Client(
-                 vertexai=True,
-                 project=self.project,
-                 location=self.location,
-                 credentials=(
-                     service_account.Credentials.from_service_account_info(
-                         self.service_account_key,
-                         scopes=[
-                             "https://www.googleapis.com/auth/cloud-platform"
-                         ],
+         for ele in multimodal_input:
+             if isinstance(ele, str):
+                 prompt.append(ele)
+             elif isinstance(ele, MLLMImage):
+                 if ele.local:
+                     with open(ele.url, "rb") as f:
+                         image_data = f.read()
+                 else:
+                     response = requests.get(
+                         ele.url,
+                         timeout=(
+                             settings.MEDIA_IMAGE_CONNECT_TIMEOUT_SECONDS,
+                             settings.MEDIA_IMAGE_READ_TIMEOUT_SECONDS,
+                         ),
                      )
-                     if self.service_account_key
-                     else None
-                 ),
-                 **self.kwargs,
-             )
-         else:
-             if not self.api_key:
-                 raise ValueError(
-                     "Google API key is required. Either provide it directly, set GOOGLE_API_KEY environment variable, "
-                     "or set it in your DeepEval configuration."
+                     response.raise_for_status()
+                     image_data = response.content
+
+                 image_part = self._module.types.Part.from_bytes(
+                     data=image_data, mime_type="image/jpeg"
                  )
-             # Create client for Gemini API
-             self.client = Client(api_key=self.api_key, **self.kwargs)
+                 prompt.append(image_part)
+             else:
+                 raise ValueError(f"Invalid input type: {type(ele)}")
+         return prompt

-         # Configure default model generation settings
-         self.model_safety_settings = [
-             types.SafetySetting(
-                 category=types.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
-                 threshold=types.HarmBlockThreshold.BLOCK_NONE,
-             ),
-             types.SafetySetting(
-                 category=types.HarmCategory.HARM_CATEGORY_HARASSMENT,
-                 threshold=types.HarmBlockThreshold.BLOCK_NONE,
-             ),
-             types.SafetySetting(
-                 category=types.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
-                 threshold=types.HarmBlockThreshold.BLOCK_NONE,
-             ),
-             types.SafetySetting(
-                 category=types.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
-                 threshold=types.HarmBlockThreshold.BLOCK_NONE,
-             ),
-         ]
-         return self.client.models
+     ###############################################
+     # Generate functions
+     ###############################################

      @retry_gemini
      def generate(self, prompt: str, schema: Optional[BaseModel] = None) -> str:
@@ -186,11 +210,18 @@ class GeminiModel(DeepEvalBaseLLM):
          Returns:
              Generated text response or structured output as Pydantic model
          """
+         client = self.load_model()
+
+         if check_if_multimodal(prompt):
+
+             prompt = convert_to_multi_modal_array(prompt)
+             prompt = self.generate_prompt(prompt)
+
          if schema is not None:
-             response = self.client.models.generate_content(
-                 model=self.model_name,
+             response = client.models.generate_content(
+                 model=self.name,
                  contents=prompt,
-                 config=types.GenerateContentConfig(
+                 config=self._module.types.GenerateContentConfig(
                      response_mime_type="application/json",
                      response_schema=schema,
                      safety_settings=self.model_safety_settings,
@@ -200,10 +231,10 @@ class GeminiModel(DeepEvalBaseLLM):
              )
              return response.parsed, 0
          else:
-             response = self.client.models.generate_content(
-                 model=self.model_name,
+             response = client.models.generate_content(
+                 model=self.name,
                  contents=prompt,
-                 config=types.GenerateContentConfig(
+                 config=self._module.types.GenerateContentConfig(
                      safety_settings=self.model_safety_settings,
                      temperature=self.temperature,
                      **self.generation_kwargs,
@@ -224,11 +255,17 @@ class GeminiModel(DeepEvalBaseLLM):
          Returns:
              Generated text response or structured output as Pydantic model
          """
+         client = self.load_model()
+
+         if check_if_multimodal(prompt):
+             prompt = convert_to_multi_modal_array(prompt)
+             prompt = self.generate_prompt(prompt)
+
          if schema is not None:
-             response = await self.client.aio.models.generate_content(
-                 model=self.model_name,
+             response = await client.aio.models.generate_content(
+                 model=self.name,
                  contents=prompt,
-                 config=types.GenerateContentConfig(
+                 config=self._module.types.GenerateContentConfig(
                      response_mime_type="application/json",
                      response_schema=schema,
                      safety_settings=self.model_safety_settings,
@@ -238,10 +275,10 @@ class GeminiModel(DeepEvalBaseLLM):
              )
              return response.parsed, 0
          else:
-             response = await self.client.aio.models.generate_content(
-                 model=self.model_name,
+             response = await client.aio.models.generate_content(
+                 model=self.name,
                  contents=prompt,
-                 config=types.GenerateContentConfig(
+                 config=self._module.types.GenerateContentConfig(
                      safety_settings=self.model_safety_settings,
                      temperature=self.temperature,
                      **self.generation_kwargs,
@@ -249,6 +286,88 @@ class GeminiModel(DeepEvalBaseLLM):
              )
          return response.text, 0

-     def get_model_name(self) -> str:
-         """Returns the name of the Gemini model being used."""
-         return self.model_name
+     #########
+     # Model #
+     #########
+
+     def load_model(self):
+         """Creates a client.
+         With Gen AI SDK, model is set at inference time, so there is no
+         model to load and initialize.
+         This method name is kept for compatibility with other LLMs.
+
+         Returns:
+             A GenerativeModel instance configured for evaluation.
+         """
+         return self._build_client()
+
+     def _require_oauth2(self):
+         return require_dependency(
+             "google.oauth2",
+             provider_label="GeminiModel",
+             install_hint="Install it with `pip install google-auth`.",
+         )
+
+     def _require_module(self):
+         return require_dependency(
+             "google.genai",
+             provider_label="GeminiModel",
+             install_hint="Install it with `pip install google-genai`.",
+         )
+
+     def _client_kwargs(self, **override_kwargs) -> Dict:
+         """Merge ctor kwargs with any overrides passed at load_model time."""
+         client_kwargs = dict(self.kwargs or {})
+         if override_kwargs:
+             client_kwargs.update(override_kwargs)
+         return client_kwargs
+
+     def _build_client(self) -> "Client":
+         client_kwargs = self._client_kwargs(**self.kwargs)
+
+         if self.should_use_vertexai():
+             if not self.project or not self.location:
+                 raise ValueError(
+                     "When using Vertex AI API, both project and location are required. "
+                     "Either provide them as arguments or set GOOGLE_CLOUD_PROJECT and "
+                     "GOOGLE_CLOUD_LOCATION in your DeepEval configuration."
+                 )
+
+             oauth2 = self._require_oauth2()
+             credentials = (
+                 oauth2.service_account.Credentials.from_service_account_info(
+                     self.service_account_key,
+                     scopes=[
+                         "https://www.googleapis.com/auth/cloud-platform",
+                     ],
+                 )
+                 if self.service_account_key
+                 else None
+             )
+
+             client = self._module.Client(
+                 vertexai=True,
+                 project=self.project,
+                 location=self.location,
+                 credentials=credentials,
+                 **client_kwargs,
+             )
+         else:
+             api_key = require_secret_api_key(
+                 self.api_key,
+                 provider_label="Google Gemini",
+                 env_var_name="GOOGLE_API_KEY",
+                 param_hint="`api_key` to GeminiModel(...)",
+             )
+
+             client = self._module.Client(api_key=api_key, **client_kwargs)
+
+         return client
+
+     def supports_multimodal(self):
+         if self.name in valid_multimodal_models:
+             return True
+         return False
+
+     def get_model_name(self):
+         return f"{self.name} (Gemini)"
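
Beyond the settings migration, the GeminiModel diff renames the constructor keyword from model_name to model, defers client construction to load_model()/_build_client() so the google-genai dependency is only needed at call time, and validates the API key with require_secret_api_key when a client is actually built. What SecretStr buys is that the key never leaks through str() or repr(); the helper sketched after it is a hypothetical reconstruction inferred only from its call sites in the diff:

from pydantic import SecretStr

key = SecretStr("sk-example-not-real")
print(key)                     # **********  (masked when printed or logged)
print(key.get_secret_value())  # sk-example-not-real (explicit opt-in)


# Hypothetical sketch of require_secret_api_key, inferred from its call sites:
def require_secret_api_key(secret, *, provider_label, env_var_name, param_hint):
    if secret is None:
        raise ValueError(
            f"{provider_label} API key is required. Set {env_var_name} "
            f"in your environment or pass {param_hint}."
        )
    return secret.get_secret_value()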
deepeval/models/llms/grok_model.py

@@ -1,18 +1,18 @@
- import os
-
  from typing import Optional, Tuple, Union, Dict
- from pydantic import BaseModel
+ from pydantic import BaseModel, SecretStr

+ from deepeval.config.settings import get_settings
  from deepeval.models.retry_policy import (
      create_retry_decorator,
      sdk_retries_for,
  )
- from deepeval.key_handler import ModelKeyValues, KEY_FILE_HANDLER
  from deepeval.models.llms.utils import trim_and_load_json
+ from deepeval.models.utils import (
+     require_secret_api_key,
+ )
  from deepeval.models import DeepEvalBaseLLM
  from deepeval.constants import ProviderSlug as PS

-
  # consistent retry rules
  retry_grok = create_retry_decorator(PS.GROK)

@@ -62,30 +62,33 @@ class GrokModel(DeepEvalBaseLLM):
          generation_kwargs: Optional[Dict] = None,
          **kwargs,
      ):
-         model_name = model or KEY_FILE_HANDLER.fetch_data(
-             ModelKeyValues.GROK_MODEL_NAME
-         )
-         if model_name not in model_pricing:
+
+         settings = get_settings()
+
+         model = model or settings.GROK_MODEL_NAME
+
+         if model not in model_pricing:
              raise ValueError(
                  f"Invalid model. Available Grok models: {', '.join(model_pricing.keys())}"
              )
-         temperature_from_key = KEY_FILE_HANDLER.fetch_data(
-             ModelKeyValues.TEMPERATURE
-         )
+         temperature_from_key = settings.TEMPERATURE
          if temperature_from_key is None:
              self.temperature = temperature
          else:
              self.temperature = float(temperature_from_key)
          if self.temperature < 0:
              raise ValueError("Temperature must be >= 0.")
-         self.api_key = (
-             api_key
-             or KEY_FILE_HANDLER.fetch_data(ModelKeyValues.GROK_API_KEY)
-             or os.getenv("GROK_API_KEY")
-         )
+
+         if api_key is not None:
+             # keep it secret, keep it safe from serializings, logging and alike
+             self.api_key: SecretStr | None = SecretStr(api_key)
+         else:
+             self.api_key = settings.GROK_API_KEY
+
+         # Keep sanitized kwargs for client call to strip legacy keys
          self.kwargs = kwargs
          self.generation_kwargs = generation_kwargs or {}
-         super().__init__(model_name)
+         super().__init__(model)

      ###############################################
      # Other generate functions
@@ -95,6 +98,7 @@ class GrokModel(DeepEvalBaseLLM):
      def generate(
          self, prompt: str, schema: Optional[BaseModel] = None
      ) -> Tuple[Union[str, Dict], float]:
+
          try:
              from xai_sdk.chat import user
          except ImportError:
@@ -103,13 +107,13 @@ class GrokModel(DeepEvalBaseLLM):
              )
          client = self.load_model(async_mode=False)
          chat = client.chat.create(
-             model=self.model_name,
+             model=self.name,
              temperature=self.temperature,
              **self.generation_kwargs,
          )
          chat.append(user(prompt))

-         if schema and self.model_name in structured_outputs_models:
+         if schema and self.name in structured_outputs_models:
              response, structured_output = chat.parse(schema)
              cost = self.calculate_cost(
                  response.usage.prompt_tokens,
@@ -133,6 +137,7 @@ class GrokModel(DeepEvalBaseLLM):
      async def a_generate(
          self, prompt: str, schema: Optional[BaseModel] = None
      ) -> Tuple[Union[str, Dict], float]:
+
          try:
              from xai_sdk.chat import user
          except ImportError:
@@ -141,13 +146,13 @@ class GrokModel(DeepEvalBaseLLM):
              )
          client = self.load_model(async_mode=True)
          chat = client.chat.create(
-             model=self.model_name,
+             model=self.name,
              temperature=self.temperature,
              **self.generation_kwargs,
          )
          chat.append(user(prompt))

-         if schema and self.model_name in structured_outputs_models:
+         if schema and self.name in structured_outputs_models:
              response, structured_output = await chat.parse(schema)
              cost = self.calculate_cost(
                  response.usage.prompt_tokens,
@@ -176,7 +181,7 @@ class GrokModel(DeepEvalBaseLLM):
          input_tokens: int,
          output_tokens: int,
      ) -> float:
-         pricing = model_pricing.get(self.model_name, model_pricing)
+         pricing = model_pricing.get(self.name, model_pricing)
          input_cost = input_tokens * pricing["input"]
          output_cost = output_tokens * pricing["output"]
          return input_cost + output_cost
@@ -198,9 +203,6 @@ class GrokModel(DeepEvalBaseLLM):
              "xai_sdk is required to use GrokModel. Please install it with: pip install xai-sdk"
          )

-     def get_model_name(self):
-         return f"{self.model_name}"
-
      def _client_kwargs(self) -> Dict:
          """
          If Tenacity is managing retries, disable gRPC channel retries to avoid double retry.
@@ -226,7 +228,14 @@ class GrokModel(DeepEvalBaseLLM):
          return kwargs

      def _build_client(self, cls):
-         kw = dict(api_key=self.api_key, **self._client_kwargs())
+         api_key = require_secret_api_key(
+             self.api_key,
+             provider_label="Grok",
+             env_var_name="GROK_API_KEY",
+             param_hint="`api_key` to GrokModel(...)",
+         )
+
+         kw = dict(api_key=api_key, **self._client_kwargs())
          try:
              return cls(**kw)
          except TypeError as e:
@@ -235,3 +244,6 @@ class GrokModel(DeepEvalBaseLLM):
              kw.pop("channel_options", None)
              return cls(**kw)
          raise
+
+     def get_model_name(self):
+         return f"{self.name} (Grok)"
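
GrokModel keeps its defensive client construction: build the client with the full kwargs (including gRPC channel_options), and if the installed xai_sdk rejects that keyword, drop it and retry once. The diff elides the lines between except TypeError as e: and the retry, so the condition in this standalone sketch is an assumption; the rest mirrors the hunk above:

def build_client(cls, **kw):
    """Construct cls(**kw), dropping channel_options if the SDK rejects it."""
    try:
        return cls(**kw)
    except TypeError as e:
        # Assumed check: the diff does not show the exact condition used.
        if "channel_options" in str(e):
            kw.pop("channel_options", None)
            return cls(**kw)
        raise


# Demo with a stand-in client that does not accept channel_options:
class FakeClient:
    def __init__(self, api_key):
        self.api_key = api_key


client = build_client(
    FakeClient, api_key="k", channel_options=[("grpc.enable_retries", 0)]
)
assert client.api_key == "k"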