deepeval 3.7.3__py3-none-any.whl → 3.7.5__py3-none-any.whl

This diff compares the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
Files changed (156)
  1. deepeval/_version.py +1 -1
  2. deepeval/cli/test.py +1 -1
  3. deepeval/config/settings.py +102 -13
  4. deepeval/dataset/golden.py +54 -2
  5. deepeval/evaluate/configs.py +1 -1
  6. deepeval/evaluate/evaluate.py +16 -8
  7. deepeval/evaluate/execute.py +74 -27
  8. deepeval/evaluate/utils.py +26 -22
  9. deepeval/integrations/pydantic_ai/agent.py +19 -2
  10. deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
  11. deepeval/metrics/__init__.py +14 -12
  12. deepeval/metrics/answer_relevancy/answer_relevancy.py +74 -29
  13. deepeval/metrics/answer_relevancy/template.py +188 -92
  14. deepeval/metrics/argument_correctness/template.py +2 -2
  15. deepeval/metrics/base_metric.py +2 -5
  16. deepeval/metrics/bias/template.py +3 -3
  17. deepeval/metrics/contextual_precision/contextual_precision.py +53 -15
  18. deepeval/metrics/contextual_precision/template.py +115 -66
  19. deepeval/metrics/contextual_recall/contextual_recall.py +50 -13
  20. deepeval/metrics/contextual_recall/template.py +106 -55
  21. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +47 -15
  22. deepeval/metrics/contextual_relevancy/template.py +87 -58
  23. deepeval/metrics/conversation_completeness/template.py +2 -2
  24. deepeval/metrics/conversational_dag/templates.py +4 -4
  25. deepeval/metrics/conversational_g_eval/template.py +4 -3
  26. deepeval/metrics/dag/templates.py +5 -5
  27. deepeval/metrics/faithfulness/faithfulness.py +70 -27
  28. deepeval/metrics/faithfulness/schema.py +1 -1
  29. deepeval/metrics/faithfulness/template.py +200 -115
  30. deepeval/metrics/g_eval/utils.py +2 -2
  31. deepeval/metrics/hallucination/template.py +4 -4
  32. deepeval/metrics/indicator.py +4 -4
  33. deepeval/metrics/misuse/template.py +2 -2
  34. deepeval/metrics/multimodal_metrics/__init__.py +0 -18
  35. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +24 -17
  36. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +26 -21
  37. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +24 -17
  38. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +24 -17
  39. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +19 -19
  40. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +63 -78
  41. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +20 -20
  42. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +71 -50
  43. deepeval/metrics/non_advice/template.py +2 -2
  44. deepeval/metrics/pii_leakage/template.py +2 -2
  45. deepeval/metrics/prompt_alignment/template.py +4 -4
  46. deepeval/metrics/ragas.py +3 -3
  47. deepeval/metrics/role_violation/template.py +2 -2
  48. deepeval/metrics/step_efficiency/step_efficiency.py +1 -1
  49. deepeval/metrics/tool_correctness/tool_correctness.py +2 -2
  50. deepeval/metrics/toxicity/template.py +4 -4
  51. deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  52. deepeval/metrics/turn_contextual_precision/template.py +187 -0
  53. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +550 -0
  54. deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  55. deepeval/metrics/turn_contextual_recall/template.py +178 -0
  56. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +520 -0
  57. deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
  58. deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
  59. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +535 -0
  60. deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
  61. deepeval/metrics/turn_faithfulness/template.py +218 -0
  62. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +596 -0
  63. deepeval/metrics/turn_relevancy/template.py +2 -2
  64. deepeval/metrics/utils.py +39 -58
  65. deepeval/models/__init__.py +0 -12
  66. deepeval/models/base_model.py +16 -38
  67. deepeval/models/embedding_models/__init__.py +7 -0
  68. deepeval/models/embedding_models/azure_embedding_model.py +69 -32
  69. deepeval/models/embedding_models/local_embedding_model.py +39 -22
  70. deepeval/models/embedding_models/ollama_embedding_model.py +42 -18
  71. deepeval/models/embedding_models/openai_embedding_model.py +50 -15
  72. deepeval/models/llms/amazon_bedrock_model.py +1 -2
  73. deepeval/models/llms/anthropic_model.py +53 -20
  74. deepeval/models/llms/azure_model.py +140 -43
  75. deepeval/models/llms/deepseek_model.py +38 -23
  76. deepeval/models/llms/gemini_model.py +222 -103
  77. deepeval/models/llms/grok_model.py +39 -27
  78. deepeval/models/llms/kimi_model.py +39 -23
  79. deepeval/models/llms/litellm_model.py +103 -45
  80. deepeval/models/llms/local_model.py +35 -22
  81. deepeval/models/llms/ollama_model.py +129 -17
  82. deepeval/models/llms/openai_model.py +151 -50
  83. deepeval/models/llms/portkey_model.py +149 -0
  84. deepeval/models/llms/utils.py +5 -3
  85. deepeval/models/retry_policy.py +17 -14
  86. deepeval/models/utils.py +94 -4
  87. deepeval/optimizer/__init__.py +5 -0
  88. deepeval/optimizer/algorithms/__init__.py +6 -0
  89. deepeval/optimizer/algorithms/base.py +29 -0
  90. deepeval/optimizer/algorithms/configs.py +18 -0
  91. deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  92. deepeval/optimizer/algorithms/copro/copro.py +836 -0
  93. deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  94. deepeval/optimizer/algorithms/gepa/gepa.py +737 -0
  95. deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  96. deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  97. deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  98. deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  99. deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  100. deepeval/optimizer/algorithms/simba/simba.py +999 -0
  101. deepeval/optimizer/algorithms/simba/types.py +15 -0
  102. deepeval/optimizer/configs.py +31 -0
  103. deepeval/optimizer/policies.py +227 -0
  104. deepeval/optimizer/prompt_optimizer.py +263 -0
  105. deepeval/optimizer/rewriter/__init__.py +5 -0
  106. deepeval/optimizer/rewriter/rewriter.py +124 -0
  107. deepeval/optimizer/rewriter/utils.py +214 -0
  108. deepeval/optimizer/scorer/__init__.py +5 -0
  109. deepeval/optimizer/scorer/base.py +86 -0
  110. deepeval/optimizer/scorer/scorer.py +316 -0
  111. deepeval/optimizer/scorer/utils.py +30 -0
  112. deepeval/optimizer/types.py +148 -0
  113. deepeval/optimizer/utils.py +480 -0
  114. deepeval/prompt/prompt.py +7 -6
  115. deepeval/test_case/__init__.py +1 -3
  116. deepeval/test_case/api.py +12 -10
  117. deepeval/test_case/conversational_test_case.py +19 -1
  118. deepeval/test_case/llm_test_case.py +152 -1
  119. deepeval/test_case/utils.py +4 -8
  120. deepeval/test_run/api.py +15 -14
  121. deepeval/test_run/cache.py +2 -0
  122. deepeval/test_run/test_run.py +9 -4
  123. deepeval/tracing/patchers.py +9 -4
  124. deepeval/tracing/tracing.py +2 -2
  125. deepeval/utils.py +89 -0
  126. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/METADATA +1 -4
  127. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/RECORD +134 -118
  128. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
  129. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
  130. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
  131. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
  132. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
  133. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
  134. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
  135. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
  136. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
  137. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
  138. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
  139. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  140. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
  141. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
  142. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  143. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
  144. deepeval/models/mlllms/__init__.py +0 -4
  145. deepeval/models/mlllms/azure_model.py +0 -334
  146. deepeval/models/mlllms/gemini_model.py +0 -284
  147. deepeval/models/mlllms/ollama_model.py +0 -144
  148. deepeval/models/mlllms/openai_model.py +0 -258
  149. deepeval/test_case/mllm_test_case.py +0 -170
  150. /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
  151. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
  152. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
  153. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
  154. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/LICENSE.md +0 -0
  155. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/WHEEL +0 -0
  156. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/entry_points.txt +0 -0
deepeval/models/llms/ollama_model.py

@@ -1,15 +1,32 @@
-from ollama import Client, AsyncClient, ChatResponse
-from typing import Optional, Tuple, Union, Dict
+from typing import TYPE_CHECKING, Optional, Tuple, Union, Dict, List
 from pydantic import BaseModel
+import requests
+import base64
+import io
 
+from deepeval.config.settings import get_settings
+from deepeval.utils import require_dependency
 from deepeval.models.retry_policy import (
     create_retry_decorator,
 )
-
+from deepeval.utils import convert_to_multi_modal_array, check_if_multimodal
+from deepeval.test_case import MLLMImage
 from deepeval.models import DeepEvalBaseLLM
-from deepeval.key_handler import ModelKeyValues, KEY_FILE_HANDLER
 from deepeval.constants import ProviderSlug as PS
 
+valid_multimodal_models = [
+    "llava:7b",
+    "llava:13b",
+    "llava:34b",
+    "llama4",
+    "gemma3",
+    "qwen3-vl",
+    "qwen2.5-vl",
+    # TODO: Add more models later on by looking at their catelogue
+]
+
+if TYPE_CHECKING:
+    from ollama import ChatResponse
 
 retry_ollama = create_retry_decorator(PS.OLLAMA)
 
@@ -23,20 +40,23 @@ class OllamaModel(DeepEvalBaseLLM):
         generation_kwargs: Optional[Dict] = None,
         **kwargs,
     ):
-        model_name = model or KEY_FILE_HANDLER.fetch_data(
-            ModelKeyValues.LOCAL_MODEL_NAME
-        )
+        settings = get_settings()
+        model = model or settings.LOCAL_MODEL_NAME
         self.base_url = (
             base_url
-            or KEY_FILE_HANDLER.fetch_data(ModelKeyValues.LOCAL_MODEL_BASE_URL)
+            or (
+                settings.LOCAL_MODEL_BASE_URL
+                and str(settings.LOCAL_MODEL_BASE_URL)
+            )
             or "http://localhost:11434"
         )
         if temperature < 0:
             raise ValueError("Temperature must be >= 0.")
         self.temperature = temperature
+        # Keep sanitized kwargs for client call to strip legacy keys
         self.kwargs = kwargs
         self.generation_kwargs = generation_kwargs or {}
-        super().__init__(model_name)
+        super().__init__(model)
 
     ###############################################
     # Other generate functions
@@ -47,9 +67,17 @@ class OllamaModel(DeepEvalBaseLLM):
         self, prompt: str, schema: Optional[BaseModel] = None
     ) -> Tuple[Union[str, Dict], float]:
         chat_model = self.load_model()
+
+        if check_if_multimodal(prompt):
+            prompt = convert_to_multi_modal_array(prompt)
+            messages = self.generate_messages(prompt)
+        else:
+            messages = [{"role": "user", "content": prompt}]
+        print(messages)
+
         response: ChatResponse = chat_model.chat(
-            model=self.model_name,
-            messages=[{"role": "user", "content": prompt}],
+            model=self.name,
+            messages=messages,
             format=schema.model_json_schema() if schema else None,
             options={
                 **{"temperature": self.temperature},
@@ -70,9 +98,16 @@ class OllamaModel(DeepEvalBaseLLM):
         self, prompt: str, schema: Optional[BaseModel] = None
     ) -> Tuple[str, float]:
         chat_model = self.load_model(async_mode=True)
+
+        if check_if_multimodal(prompt):
+            prompt = convert_to_multi_modal_array(prompt)
+            messages = self.generate_messages(prompt)
+        else:
+            messages = [{"role": "user", "content": prompt}]
+
         response: ChatResponse = await chat_model.chat(
-            model=self.model_name,
-            messages=[{"role": "user", "content": prompt}],
+            model=self.name,
+            messages=messages,
             format=schema.model_json_schema() if schema else None,
             options={
                 **{"temperature": self.temperature},
@@ -88,17 +123,94 @@ class OllamaModel(DeepEvalBaseLLM):
             0,
         )
 
+    def generate_messages(
+        self, multimodal_input: List[Union[str, MLLMImage]] = []
+    ):
+        messages = []
+        for ele in multimodal_input:
+            if isinstance(ele, str):
+                messages.append(
+                    {
+                        "role": "user",
+                        "content": ele,
+                    }
+                )
+            elif isinstance(ele, MLLMImage):
+                img_b64 = self.convert_to_base64(ele.url, ele.local)
+                if img_b64 is not None:
+                    messages.append(
+                        {
+                            "role": "user",
+                            "images": [img_b64],
+                        }
+                    )
+        return messages
+
+    ###############################################
+    # Utilities
+    ###############################################
+
+    def convert_to_base64(self, image_source: str, is_local: bool) -> str:
+        from PIL import Image
+
+        settings = get_settings()
+        try:
+            if not is_local:
+                response = requests.get(
+                    image_source,
+                    stream=True,
+                    timeout=(
+                        settings.MEDIA_IMAGE_CONNECT_TIMEOUT_SECONDS,
+                        settings.MEDIA_IMAGE_READ_TIMEOUT_SECONDS,
+                    ),
+                )
+                response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
+                image = Image.open(io.BytesIO(response.content))
+            else:
+                image = Image.open(image_source)
+
+            buffered = io.BytesIO()
+            image.save(buffered, format="JPEG")
+            img_str = base64.b64encode(buffered.getvalue()).decode()
+            return img_str
+
+        except (requests.exceptions.RequestException, OSError) as e:
+            # Log, then rethrow so @retry_ollama can retry generate_messages() on network failures
+            print(f"Image fetch/encode failed: {e}")
+            raise
+        except Exception as e:
+            print(f"Error converting image to base64: {e}")
+            return None
+
     ###############################################
     # Model
     ###############################################
 
     def load_model(self, async_mode: bool = False):
+        ollama = require_dependency(
+            "ollama",
+            provider_label="OllamaModel",
+            install_hint="Install it with `pip install ollama`.",
+        )
         if not async_mode:
-            return self._build_client(Client)
-        return self._build_client(AsyncClient)
+            return self._build_client(ollama.Client)
+        return self._build_client(ollama.AsyncClient)
+
+    def _client_kwargs(self) -> Dict:
+        """Return kwargs forwarded to the underlying Ollama Client/AsyncClient."""
+        return dict(self.kwargs or {})
 
     def _build_client(self, cls):
-        return cls(host=self.base_url, **self.kwargs)
+        kw = dict(
+            host=self.base_url,
+            **self._client_kwargs(),
+        )
+        return cls(**kw)
+
+    def supports_multimodal(self):
+        if self.name in valid_multimodal_models:
+            return True
+        return False
 
     def get_model_name(self):
-        return f"{self.model_name} (Ollama)"
+        return f"{self.name} (Ollama)"
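The 3.7.5 ollama_model.py gains native multimodal support: prompts containing images are routed through generate_messages(), each MLLMImage is base64-encoded by convert_to_base64(), supports_multimodal() gates the feature on the valid_multimodal_models list, and the ollama package is now imported lazily via require_dependency(). A minimal usage sketch, assuming a running Ollama server with a vision model pulled (e.g. `ollama pull llava:7b`); the names come from the diff above, not from official deepeval documentation, and the file path is a hypothetical example:

    from deepeval.models.llms.ollama_model import OllamaModel
    from deepeval.test_case import MLLMImage

    # Construct against a local Ollama server; base_url falls back to
    # settings.LOCAL_MODEL_BASE_URL and then "http://localhost:11434".
    model = OllamaModel(model="llava:7b", temperature=0)

    # "llava:7b" appears in valid_multimodal_models, so this returns True.
    assert model.supports_multimodal()

    # generate_messages() emits one user message per element: strings become
    # {"role": "user", "content": ...}; images become
    # {"role": "user", "images": [<base64>]} via convert_to_base64().
    messages = model.generate_messages(
        [
            "What is shown in this picture?",
            MLLMImage(url="./photo.jpg", local=True),  # hypothetical local file
        ]
    )

    # generate() itself branches on check_if_multimodal(prompt), so plain
    # string prompts keep working unchanged.
    text, cost = model.generate("Summarize RAG in one sentence.")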
deepeval/models/llms/openai_model.py

@@ -1,18 +1,23 @@
+import base64
 from openai.types.chat.chat_completion import ChatCompletion
-from deepeval.key_handler import ModelKeyValues, KEY_FILE_HANDLER
-from typing import Optional, Tuple, Union, Dict
-from pydantic import BaseModel
-
+from typing import Optional, Tuple, Union, Dict, List
+from deepeval.test_case import MLLMImage
+from pydantic import BaseModel, SecretStr
+from io import BytesIO
 from openai import (
     OpenAI,
     AsyncOpenAI,
 )
-
+from deepeval.utils import check_if_multimodal, convert_to_multi_modal_array
 from deepeval.config.settings import get_settings
 from deepeval.constants import ProviderSlug as PS
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.models.llms.utils import trim_and_load_json
-from deepeval.models.utils import parse_model_name
+from deepeval.models.utils import (
+    parse_model_name,
+    require_secret_api_key,
+    normalize_kwargs_and_extract_aliases,
+)
 from deepeval.models.retry_policy import (
     create_retry_decorator,
     sdk_retries_for,
@@ -21,6 +26,7 @@ from deepeval.models.retry_policy import (
 
 retry_openai = create_retry_decorator(PS.OPENAI)
 
+
 valid_gpt_models = [
     "gpt-3.5-turbo",
     "gpt-3.5-turbo-0125",
@@ -83,6 +89,15 @@ unsupported_log_probs_gpt_models = [
     "gpt-5-chat-latest",
 ]
 
+unsupported_log_probs_multimodal_gpt_models = [
+    "o1",
+    "o1-preview",
+    "o1-2024-12-17",
+    "o1-preview-2024-09-12",
+    "gpt-4.5-preview-2025-02-27",
+    "o4-mini",
+]
+
 structured_outputs_models = [
     "gpt-4o",
     "gpt-4o-2024-05-13",
@@ -215,77 +230,98 @@ def _request_timeout_seconds() -> float:
     return timeout if timeout > 0 else 30.0
 
 
+_ALIAS_MAP = {
+    "api_key": ["_openai_api_key"],
+}
+
+
 class GPTModel(DeepEvalBaseLLM):
+    valid_multimodal_models = [
+        "gpt-4o",
+        "gpt-4o-mini",
+        "gpt-4.1",
+        "gpt-4.1-mini",
+        "gpt-5",
+    ]
+
     def __init__(
         self,
         model: Optional[str] = None,
-        _openai_api_key: Optional[str] = None,
+        api_key: Optional[str] = None,
         base_url: Optional[str] = None,
+        temperature: float = 0,
         cost_per_input_token: Optional[float] = None,
         cost_per_output_token: Optional[float] = None,
-        temperature: float = 0,
         generation_kwargs: Optional[Dict] = None,
         **kwargs,
     ):
-        model_name = None
-        model = model or KEY_FILE_HANDLER.fetch_data(
-            ModelKeyValues.OPENAI_MODEL_NAME
+        normalized_kwargs, alias_values = normalize_kwargs_and_extract_aliases(
+            "GPTModel",
+            kwargs,
+            _ALIAS_MAP,
        )
+
+        # re-map depricated keywords to re-named positional args
+        if api_key is None and "api_key" in alias_values:
+            api_key = alias_values["api_key"]
+
+        settings = get_settings()
+        model = model or settings.OPENAI_MODEL_NAME
         cost_per_input_token = (
             cost_per_input_token
             if cost_per_input_token is not None
-            else KEY_FILE_HANDLER.fetch_data(
-                ModelKeyValues.OPENAI_COST_PER_INPUT_TOKEN
-            )
+            else settings.OPENAI_COST_PER_INPUT_TOKEN
         )
         cost_per_output_token = (
             cost_per_output_token
             if cost_per_output_token is not None
-            else KEY_FILE_HANDLER.fetch_data(
-                ModelKeyValues.OPENAI_COST_PER_OUTPUT_TOKEN
-            )
+            else settings.OPENAI_COST_PER_OUTPUT_TOKEN
         )
 
+        if model is None:
+            model = default_gpt_model
+
         if isinstance(model, str):
-            model_name = parse_model_name(model)
-            if model_name not in valid_gpt_models:
+            model = parse_model_name(model)
+            if model not in valid_gpt_models:
                 raise ValueError(
                     f"Invalid model. Available GPT models: {', '.join(model for model in valid_gpt_models)}"
                 )
-        elif model is None:
-            model_name = default_gpt_model
 
-        if model_name not in model_pricing:
+        if model not in model_pricing:
            if cost_per_input_token is None or cost_per_output_token is None:
                 raise ValueError(
-                    f"No pricing available for `{model_name}`. "
+                    f"No pricing available for `{model}`. "
                     "Please provide both `cost_per_input_token` and `cost_per_output_token` when initializing `GPTModel`, "
                     "or set them via the CLI:\n"
                     " deepeval set-openai --model=[...] --cost_per_input_token=[...] --cost_per_output_token=[...]"
                 )
             else:
-                model_pricing[model_name] = {
+                model_pricing[model] = {
                     "input": float(cost_per_input_token),
                     "output": float(cost_per_output_token),
                 }
 
-        elif model is None:
-            model_name = default_gpt_model
+        if api_key is not None:
+            # keep it secret, keep it safe from serializings, logging and alike
+            self.api_key: SecretStr | None = SecretStr(api_key)
+        else:
+            self.api_key = get_settings().OPENAI_API_KEY
 
-        self._openai_api_key = _openai_api_key
         self.base_url = base_url
         # args and kwargs will be passed to the underlying model, in load_model function
 
         # Auto-adjust temperature for models that require it
-        if model_name in models_requiring_temperature_1:
+        if model in models_requiring_temperature_1:
            temperature = 1
 
         if temperature < 0:
             raise ValueError("Temperature must be >= 0.")
         self.temperature = temperature
-        self.kwargs = kwargs
+        # Keep sanitized kwargs for client call to strip legacy keys
+        self.kwargs = normalized_kwargs
         self.generation_kwargs = generation_kwargs or {}
-        super().__init__(model_name)
+        super().__init__(model)
 
     ###############################################
     # Generate functions
@@ -296,10 +332,15 @@ class GPTModel(DeepEvalBaseLLM):
         self, prompt: str, schema: Optional[BaseModel] = None
     ) -> Tuple[Union[str, Dict], float]:
         client = self.load_model(async_mode=False)
+
+        if check_if_multimodal(prompt):
+            prompt = convert_to_multi_modal_array(input=prompt)
+            prompt = self.generate_prompt(prompt)
+
         if schema:
-            if self.model_name in structured_outputs_models:
+            if self.name in structured_outputs_models:
                 completion = client.beta.chat.completions.parse(
-                    model=self.model_name,
+                    model=self.name,
                     messages=[
                         {"role": "user", "content": prompt},
                     ],
@@ -315,9 +356,9 @@ class GPTModel(DeepEvalBaseLLM):
                     completion.usage.completion_tokens,
                 )
                 return structured_output, cost
-            if self.model_name in json_mode_models:
+            if self.name in json_mode_models:
                 completion = client.beta.chat.completions.parse(
-                    model=self.model_name,
+                    model=self.name,
                     messages=[
                         {"role": "user", "content": prompt},
                     ],
@@ -335,7 +376,7 @@ class GPTModel(DeepEvalBaseLLM):
             return schema.model_validate(json_output), cost
 
         completion = client.chat.completions.create(
-            model=self.model_name,
+            model=self.name,
             messages=[{"role": "user", "content": prompt}],
             temperature=self.temperature,
             **self.generation_kwargs,
@@ -355,10 +396,15 @@ class GPTModel(DeepEvalBaseLLM):
         self, prompt: str, schema: Optional[BaseModel] = None
     ) -> Tuple[Union[str, BaseModel], float]:
         client = self.load_model(async_mode=True)
+
+        if check_if_multimodal(prompt):
+            prompt = convert_to_multi_modal_array(input=prompt)
+            prompt = self.generate_prompt(prompt)
+
         if schema:
-            if self.model_name in structured_outputs_models:
+            if self.name in structured_outputs_models:
                 completion = await client.beta.chat.completions.parse(
-                    model=self.model_name,
+                    model=self.name,
                     messages=[
                         {"role": "user", "content": prompt},
                     ],
@@ -374,9 +420,9 @@ class GPTModel(DeepEvalBaseLLM):
                     completion.usage.completion_tokens,
                 )
                 return structured_output, cost
-            if self.model_name in json_mode_models:
+            if self.name in json_mode_models:
                 completion = await client.beta.chat.completions.parse(
-                    model=self.model_name,
+                    model=self.name,
                     messages=[
                         {"role": "user", "content": prompt},
                     ],
@@ -394,7 +440,7 @@ class GPTModel(DeepEvalBaseLLM):
             return schema.model_validate(json_output), cost
 
         completion = await client.chat.completions.create(
-            model=self.model_name,
+            model=self.name,
             messages=[{"role": "user", "content": prompt}],
             temperature=self.temperature,
             **self.generation_kwargs,
@@ -421,8 +467,11 @@ class GPTModel(DeepEvalBaseLLM):
     ) -> Tuple[ChatCompletion, float]:
         # Generate completion
         client = self.load_model(async_mode=False)
+        if check_if_multimodal(prompt):
+            prompt = convert_to_multi_modal_array(input=prompt)
+            prompt = self.generate_prompt(prompt)
         completion = client.chat.completions.create(
-            model=self.model_name,
+            model=self.name,
             messages=[{"role": "user", "content": prompt}],
             temperature=self.temperature,
             logprobs=True,
@@ -444,8 +493,11 @@ class GPTModel(DeepEvalBaseLLM):
     ) -> Tuple[ChatCompletion, float]:
         # Generate completion
         client = self.load_model(async_mode=True)
+        if check_if_multimodal(prompt):
+            prompt = convert_to_multi_modal_array(input=prompt)
+            prompt = self.generate_prompt(prompt)
         completion = await client.chat.completions.create(
-            model=self.model_name,
+            model=self.name,
             messages=[{"role": "user", "content": prompt}],
             temperature=self.temperature,
             logprobs=True,
@@ -464,8 +516,11 @@ class GPTModel(DeepEvalBaseLLM):
         self, prompt: str, n: int, temperature: float
     ) -> Tuple[list[str], float]:
         client = self.load_model(async_mode=False)
+        if check_if_multimodal(prompt):
+            prompt = convert_to_multi_modal_array(input=prompt)
+            prompt = self.generate_prompt(prompt)
         response = client.chat.completions.create(
-            model=self.model_name,
+            model=self.name,
             messages=[{"role": "user", "content": prompt}],
             n=n,
             temperature=temperature,
@@ -480,17 +535,49 @@ class GPTModel(DeepEvalBaseLLM):
 
     def calculate_cost(self, input_tokens: int, output_tokens: int) -> float:
         # TODO: consider loggin a warning instead of defaulting to whole model pricing
-        pricing = model_pricing.get(self.model_name, model_pricing)
+        pricing = model_pricing.get(self.name, model_pricing)
         input_cost = input_tokens * pricing["input"]
         output_cost = output_tokens * pricing["output"]
         return input_cost + output_cost
 
-    ###############################################
-    # Model
-    ###############################################
+    #########
+    # Model #
+    #########
 
-    def get_model_name(self):
-        return self.model_name
+    def generate_prompt(
+        self, multimodal_input: List[Union[str, MLLMImage]] = []
+    ):
+        prompt = []
+        for ele in multimodal_input:
+            if isinstance(ele, str):
+                prompt.append({"type": "text", "text": ele})
+            elif isinstance(ele, MLLMImage):
+                if ele.local:
+                    import PIL.Image
+
+                    image = PIL.Image.open(ele.url)
+                    visual_dict = {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": f"data:image/jpeg;base64,{self.encode_pil_image(image)}"
+                        },
+                    }
+                else:
+                    visual_dict = {
+                        "type": "image_url",
+                        "image_url": {"url": ele.url},
+                    }
+                prompt.append(visual_dict)
+        return prompt
+
+    def encode_pil_image(self, pil_image):
+        image_buffer = BytesIO()
+        if pil_image.mode in ("RGBA", "LA", "P"):
+            pil_image = pil_image.convert("RGB")
+        pil_image.save(image_buffer, format="JPEG")
+        image_bytes = image_buffer.getvalue()
+        base64_encoded_image = base64.b64encode(image_bytes).decode("utf-8")
+        return base64_encoded_image
 
     def load_model(self, async_mode: bool = False):
         if not async_mode:
@@ -512,9 +599,15 @@ class GPTModel(DeepEvalBaseLLM):
         return kwargs
 
     def _build_client(self, cls):
+        api_key = require_secret_api_key(
+            self.api_key,
+            provider_label="OpenAI",
+            env_var_name="OPENAI_API_KEY",
+            param_hint="`api_key` to GPTModel(...)",
+        )
 
         kw = dict(
-            api_key=self._openai_api_key,
+            api_key=api_key,
             base_url=self.base_url,
             **self._client_kwargs(),
         )
@@ -526,3 +619,11 @@ class GPTModel(DeepEvalBaseLLM):
             kw.pop("max_retries", None)
             return cls(**kw)
         raise
+
+    def supports_multimodal(self):
+        if self.name in GPTModel.valid_multimodal_models:
+            return True
+        return False
+
+    def get_model_name(self):
+        return f"{self.name}"
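In openai_model.py the headline changes are the public api_key parameter (replacing the private _openai_api_key, which is kept working through _ALIAS_MAP and normalize_kwargs_and_extract_aliases()), storage of the key as a pydantic SecretStr, configuration read via get_settings() instead of KEY_FILE_HANDLER, and the same multimodal prompt handling as the Ollama model, here via generate_prompt() producing OpenAI-style content parts. A short sketch of the new surface, assuming only the names shown in the diff above (the key values and the image URL are placeholders):

    from pydantic import SecretStr
    from deepeval.models.llms.openai_model import GPTModel
    from deepeval.test_case import MLLMImage

    # New-style keyword: the key is wrapped in SecretStr so it is not exposed
    # by repr() or logging; _build_client() unwraps it via
    # require_secret_api_key() only when the OpenAI client is constructed.
    model = GPTModel(model="gpt-4o", api_key="sk-placeholder")
    assert isinstance(model.api_key, SecretStr)
    assert model.supports_multimodal()  # "gpt-4o" is in valid_multimodal_models

    # Legacy keyword still works (likely with a deprecation warning):
    # normalize_kwargs_and_extract_aliases() pulls "_openai_api_key" out of
    # **kwargs and re-maps it, so sanitized kwargs reach the OpenAI client.
    legacy = GPTModel(model="gpt-4o", _openai_api_key="sk-placeholder")

    # Multimodal prompts become OpenAI content parts: text elements turn into
    # {"type": "text", ...}; remote images are passed by URL, while local ones
    # are inlined as base64 data URLs through encode_pil_image().
    parts = model.generate_prompt(
        [
            "What is shown in this picture?",
            MLLMImage(url="https://example.com/photo.jpg", local=False),
        ]
    )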