deepeval-3.7.3-py3-none-any.whl → deepeval-3.7.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (156)
  1. deepeval/_version.py +1 -1
  2. deepeval/cli/test.py +1 -1
  3. deepeval/config/settings.py +102 -13
  4. deepeval/dataset/golden.py +54 -2
  5. deepeval/evaluate/configs.py +1 -1
  6. deepeval/evaluate/evaluate.py +16 -8
  7. deepeval/evaluate/execute.py +74 -27
  8. deepeval/evaluate/utils.py +26 -22
  9. deepeval/integrations/pydantic_ai/agent.py +19 -2
  10. deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
  11. deepeval/metrics/__init__.py +14 -12
  12. deepeval/metrics/answer_relevancy/answer_relevancy.py +74 -29
  13. deepeval/metrics/answer_relevancy/template.py +188 -92
  14. deepeval/metrics/argument_correctness/template.py +2 -2
  15. deepeval/metrics/base_metric.py +2 -5
  16. deepeval/metrics/bias/template.py +3 -3
  17. deepeval/metrics/contextual_precision/contextual_precision.py +53 -15
  18. deepeval/metrics/contextual_precision/template.py +115 -66
  19. deepeval/metrics/contextual_recall/contextual_recall.py +50 -13
  20. deepeval/metrics/contextual_recall/template.py +106 -55
  21. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +47 -15
  22. deepeval/metrics/contextual_relevancy/template.py +87 -58
  23. deepeval/metrics/conversation_completeness/template.py +2 -2
  24. deepeval/metrics/conversational_dag/templates.py +4 -4
  25. deepeval/metrics/conversational_g_eval/template.py +4 -3
  26. deepeval/metrics/dag/templates.py +5 -5
  27. deepeval/metrics/faithfulness/faithfulness.py +70 -27
  28. deepeval/metrics/faithfulness/schema.py +1 -1
  29. deepeval/metrics/faithfulness/template.py +200 -115
  30. deepeval/metrics/g_eval/utils.py +2 -2
  31. deepeval/metrics/hallucination/template.py +4 -4
  32. deepeval/metrics/indicator.py +4 -4
  33. deepeval/metrics/misuse/template.py +2 -2
  34. deepeval/metrics/multimodal_metrics/__init__.py +0 -18
  35. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +24 -17
  36. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +26 -21
  37. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +24 -17
  38. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +24 -17
  39. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +19 -19
  40. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +63 -78
  41. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +20 -20
  42. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +71 -50
  43. deepeval/metrics/non_advice/template.py +2 -2
  44. deepeval/metrics/pii_leakage/template.py +2 -2
  45. deepeval/metrics/prompt_alignment/template.py +4 -4
  46. deepeval/metrics/ragas.py +3 -3
  47. deepeval/metrics/role_violation/template.py +2 -2
  48. deepeval/metrics/step_efficiency/step_efficiency.py +1 -1
  49. deepeval/metrics/tool_correctness/tool_correctness.py +2 -2
  50. deepeval/metrics/toxicity/template.py +4 -4
  51. deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  52. deepeval/metrics/turn_contextual_precision/template.py +187 -0
  53. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +550 -0
  54. deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  55. deepeval/metrics/turn_contextual_recall/template.py +178 -0
  56. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +520 -0
  57. deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
  58. deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
  59. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +535 -0
  60. deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
  61. deepeval/metrics/turn_faithfulness/template.py +218 -0
  62. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +596 -0
  63. deepeval/metrics/turn_relevancy/template.py +2 -2
  64. deepeval/metrics/utils.py +39 -58
  65. deepeval/models/__init__.py +0 -12
  66. deepeval/models/base_model.py +16 -38
  67. deepeval/models/embedding_models/__init__.py +7 -0
  68. deepeval/models/embedding_models/azure_embedding_model.py +69 -32
  69. deepeval/models/embedding_models/local_embedding_model.py +39 -22
  70. deepeval/models/embedding_models/ollama_embedding_model.py +42 -18
  71. deepeval/models/embedding_models/openai_embedding_model.py +50 -15
  72. deepeval/models/llms/amazon_bedrock_model.py +1 -2
  73. deepeval/models/llms/anthropic_model.py +53 -20
  74. deepeval/models/llms/azure_model.py +140 -43
  75. deepeval/models/llms/deepseek_model.py +38 -23
  76. deepeval/models/llms/gemini_model.py +222 -103
  77. deepeval/models/llms/grok_model.py +39 -27
  78. deepeval/models/llms/kimi_model.py +39 -23
  79. deepeval/models/llms/litellm_model.py +103 -45
  80. deepeval/models/llms/local_model.py +35 -22
  81. deepeval/models/llms/ollama_model.py +129 -17
  82. deepeval/models/llms/openai_model.py +151 -50
  83. deepeval/models/llms/portkey_model.py +149 -0
  84. deepeval/models/llms/utils.py +5 -3
  85. deepeval/models/retry_policy.py +17 -14
  86. deepeval/models/utils.py +94 -4
  87. deepeval/optimizer/__init__.py +5 -0
  88. deepeval/optimizer/algorithms/__init__.py +6 -0
  89. deepeval/optimizer/algorithms/base.py +29 -0
  90. deepeval/optimizer/algorithms/configs.py +18 -0
  91. deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  92. deepeval/optimizer/algorithms/copro/copro.py +836 -0
  93. deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  94. deepeval/optimizer/algorithms/gepa/gepa.py +737 -0
  95. deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  96. deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  97. deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  98. deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  99. deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  100. deepeval/optimizer/algorithms/simba/simba.py +999 -0
  101. deepeval/optimizer/algorithms/simba/types.py +15 -0
  102. deepeval/optimizer/configs.py +31 -0
  103. deepeval/optimizer/policies.py +227 -0
  104. deepeval/optimizer/prompt_optimizer.py +263 -0
  105. deepeval/optimizer/rewriter/__init__.py +5 -0
  106. deepeval/optimizer/rewriter/rewriter.py +124 -0
  107. deepeval/optimizer/rewriter/utils.py +214 -0
  108. deepeval/optimizer/scorer/__init__.py +5 -0
  109. deepeval/optimizer/scorer/base.py +86 -0
  110. deepeval/optimizer/scorer/scorer.py +316 -0
  111. deepeval/optimizer/scorer/utils.py +30 -0
  112. deepeval/optimizer/types.py +148 -0
  113. deepeval/optimizer/utils.py +480 -0
  114. deepeval/prompt/prompt.py +7 -6
  115. deepeval/test_case/__init__.py +1 -3
  116. deepeval/test_case/api.py +12 -10
  117. deepeval/test_case/conversational_test_case.py +19 -1
  118. deepeval/test_case/llm_test_case.py +152 -1
  119. deepeval/test_case/utils.py +4 -8
  120. deepeval/test_run/api.py +15 -14
  121. deepeval/test_run/cache.py +2 -0
  122. deepeval/test_run/test_run.py +9 -4
  123. deepeval/tracing/patchers.py +9 -4
  124. deepeval/tracing/tracing.py +2 -2
  125. deepeval/utils.py +89 -0
  126. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/METADATA +1 -4
  127. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/RECORD +134 -118
  128. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
  129. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
  130. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
  131. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
  132. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
  133. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
  134. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
  135. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
  136. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
  137. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
  138. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
  139. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  140. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
  141. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
  142. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  143. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
  144. deepeval/models/mlllms/__init__.py +0 -4
  145. deepeval/models/mlllms/azure_model.py +0 -334
  146. deepeval/models/mlllms/gemini_model.py +0 -284
  147. deepeval/models/mlllms/ollama_model.py +0 -144
  148. deepeval/models/mlllms/openai_model.py +0 -258
  149. deepeval/test_case/mllm_test_case.py +0 -170
  150. /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
  151. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
  152. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
  153. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
  154. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/LICENSE.md +0 -0
  155. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/WHEEL +0 -0
  156. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/entry_points.txt +0 -0
deepeval/models/mlllms/gemini_model.py +0 -284
@@ -1,284 +0,0 @@
- from typing import Optional, List, Union
- import requests
- from pydantic import BaseModel
- from google.genai import types
- from google import genai
-
- from deepeval.models.retry_policy import (
-     create_retry_decorator,
- )
- from deepeval.key_handler import ModelKeyValues, KEY_FILE_HANDLER
- from deepeval.models.base_model import DeepEvalBaseMLLM
- from deepeval.test_case import MLLMImage
- from deepeval.config.settings import get_settings
- from deepeval.constants import ProviderSlug as PS
-
-
- default_multimodal_gemini_model = "gemini-1.5-pro"
- # consistent retry rules
- retry_gemini = create_retry_decorator(PS.GOOGLE)
-
-
- class MultimodalGeminiModel(DeepEvalBaseMLLM):
-     """Class that implements Google Gemini models for multimodal evaluation.
-
-     This class provides integration with Google's Gemini models through the Google GenAI SDK,
-     supporting both text and multimodal (text + image) inputs for evaluation tasks.
-     To use Gemini API, set api_key attribute only.
-     To use Vertex AI API, set project and location attributes.
-
-     Attributes:
-         model_name: Name of the Gemini model to use
-         api_key: Google API key for authentication
-         project: Google Cloud project ID
-         location: Google Cloud location
-
-     Example:
-         ```python
-         from deepeval.models import MultimodalGeminiModel
-
-         # Initialize the model
-         model = MultimodalGeminiModel(
-             model_name="gemini-pro-vision",
-             api_key="your-api-key"
-         )
-
-         # Generate text from text + image input
-         response = model.generate([
-             "Describe what you see in this image:",
-             MLLMImage(url="path/to/image.jpg", local=True)
-         ])
-         ```
-     """
-
-     def __init__(
-         self,
-         model_name: Optional[str] = None,
-         api_key: Optional[str] = None,
-         project: Optional[str] = None,
-         location: Optional[str] = None,
-         *args,
-         **kwargs,
-     ):
-         model_name = (
-             model_name
-             or KEY_FILE_HANDLER.fetch_data(ModelKeyValues.GEMINI_MODEL_NAME)
-             or default_multimodal_gemini_model
-         )
-
-         # Get API key from key handler if not provided
-         self.api_key = api_key or KEY_FILE_HANDLER.fetch_data(
-             ModelKeyValues.GOOGLE_API_KEY
-         )
-         self.project = project or KEY_FILE_HANDLER.fetch_data(
-             ModelKeyValues.GOOGLE_CLOUD_PROJECT
-         )
-         self.location = location or KEY_FILE_HANDLER.fetch_data(
-             ModelKeyValues.GOOGLE_CLOUD_LOCATION
-         )
-         self.use_vertexai = KEY_FILE_HANDLER.fetch_data(
-             ModelKeyValues.GOOGLE_GENAI_USE_VERTEXAI
-         )
-
-         super().__init__(model_name, *args, **kwargs)
-         self.model = self.load_model(*args, **kwargs)
-
-     def should_use_vertexai(self):
-         """Checks if the model should use Vertex AI for generation.
-
-         This is determined first by the value of `GOOGLE_GENAI_USE_VERTEXAI`
-         environment variable. If not set, it checks for the presence of the
-         project and location.
-
-         Returns:
-             True if the model should use Vertex AI, False otherwise
-         """
-         if self.use_vertexai is not None:
-             return self.use_vertexai.lower() == "yes"
-
-         if self.project and self.location:
-             return True
-         else:
-             return False
-
-     def load_model(self, *args, **kwargs):
-         """Creates a client.
-         With Gen AI SDK, model is set at inference time, so there is no
-         model to load and initialize.
-         This method name is kept for compatibility with other LLMs.
-
-         Returns:
-             A GenerativeModel instance configured for evaluation.
-         """
-         if self.should_use_vertexai():
-             if not self.project or not self.location:
-                 raise ValueError(
-                     "When using Vertex AI API, both project and location are required."
-                     "Either provide them as arguments or set GOOGLE_CLOUD_PROJECT and GOOGLE_CLOUD_LOCATION environment variables, "
-                     "or set them in your DeepEval configuration."
-                 )
-
-             # Create client for Vertex AI
-             self.client = genai.Client(
-                 vertexai=True, project=self.project, location=self.location
-             )
-         else:
-             if not self.api_key:
-                 raise ValueError(
-                     "Google API key is required. Either provide it directly, set GOOGLE_API_KEY environment variable, "
-                     "or set it in your DeepEval configuration."
-                 )
-
-             # Create client for Gemini API
-             self.client = genai.Client(api_key=self.api_key)
-
-         # Configure default model generation settings
-         self.model_safety_settings = [
-             types.SafetySetting(
-                 category=types.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
-                 threshold=types.HarmBlockThreshold.BLOCK_ONLY_HIGH,
-             ),
-             types.SafetySetting(
-                 category=types.HarmCategory.HARM_CATEGORY_HARASSMENT,
-                 threshold=types.HarmBlockThreshold.BLOCK_ONLY_HIGH,
-             ),
-             types.SafetySetting(
-                 category=types.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
-                 threshold=types.HarmBlockThreshold.BLOCK_ONLY_HIGH,
-             ),
-             types.SafetySetting(
-                 category=types.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
-                 threshold=types.HarmBlockThreshold.BLOCK_ONLY_HIGH,
-             ),
-         ]
-         self.model_temperature = 0.0
-         return self.client.models
-
-     # TODO: Refactor genete prompt to minimize the work done on retry
-     @retry_gemini
-     def generate_prompt(
-         self, multimodal_input: List[Union[str, MLLMImage]] = []
-     ) -> List[Union[str, MLLMImage]]:
-         """Converts DeepEval multimodal input into GenAI SDK compatible format.
-
-         Args:
-             multimodal_input: List of strings and MLLMImage objects
-
-         Returns:
-             List of strings and PIL Image objects ready for model input
-
-         Raises:
-             ValueError: If an invalid input type is provided
-         """
-         prompt = []
-         settings = get_settings()
-
-         for ele in multimodal_input:
-             if isinstance(ele, str):
-                 prompt.append(ele)
-             elif isinstance(ele, MLLMImage):
-                 if ele.local:
-                     with open(ele.url, "rb") as f:
-                         image_data = f.read()
-                 else:
-                     response = requests.get(
-                         ele.url,
-                         timeout=(
-                             settings.MEDIA_IMAGE_CONNECT_TIMEOUT_SECONDS,
-                             settings.MEDIA_IMAGE_READ_TIMEOUT_SECONDS,
-                         ),
-                     )
-                     response.raise_for_status()
-                     image_data = response.content
-
-                 image_part = types.Part.from_bytes(
-                     data=image_data, mime_type="image/jpeg"
-                 )
-                 prompt.append(image_part)
-             else:
-                 raise ValueError(f"Invalid input type: {type(ele)}")
-         return prompt
-
-     @retry_gemini
-     def generate(
-         self,
-         multimodal_input: List[Union[str, MLLMImage]],
-         schema: Optional[BaseModel] = None,
-     ) -> str:
-         """Generates text from multimodal input.
-
-         Args:
-             multimodal_input: List of strings and MLLMImage objects
-             schema: Optional Pydantic model for structured output
-
-         Returns:
-             Generated text response
-         """
-         prompt = self.generate_prompt(multimodal_input)
-
-         if schema is not None:
-             response = self.client.models.generate_content(
-                 model=self.model_name,
-                 contents=prompt,
-                 config=types.GenerateContentConfig(
-                     response_mime_type="application/json",
-                     response_schema=schema,
-                     safety_settings=self.model_safety_settings,
-                     temperature=self.model_temperature,
-                 ),
-             )
-             return response.parsed, 0
-         else:
-             response = self.client.models.generate_content(
-                 model=self.model_name,
-                 contents=prompt,
-                 config=types.GenerateContentConfig(
-                     safety_settings=self.model_safety_settings,
-                     temperature=self.model_temperature,
-                 ),
-             )
-             return response.text, 0
-
-     @retry_gemini
-     async def a_generate(
-         self,
-         multimodal_input: List[Union[str, MLLMImage]],
-         schema: Optional[BaseModel] = None,
-     ) -> str:
-         """Asynchronously generates text from multimodal input.
-
-         Args:
-             multimodal_input: List of strings and MLLMImage objects
-             schema: Optional Pydantic model for structured output
-
-         Returns:
-             Generated text response
-         """
-         prompt = self.generate_prompt(multimodal_input)
-
-         if schema is not None:
-             response = await self.client.aio.models.generate_content(
-                 model=self.model_name,
-                 contents=prompt,
-                 config=types.GenerateContentConfig(
-                     response_mime_type="application/json",
-                     response_schema=schema,
-                     safety_settings=self.model_safety_settings,
-                     temperature=self.model_temperature,
-                 ),
-             )
-             return response.parsed, 0
-         else:
-             response = await self.client.aio.models.generate_content(
-                 model=self.model_name,
-                 contents=prompt,
-                 config=types.GenerateContentConfig(
-                     safety_settings=self.model_safety_settings,
-                     temperature=self.model_temperature,
-                 ),
-             )
-             return response.text, 0
-
-     def get_model_name(self) -> str:
-         """Returns the name of the Gemini model being used."""
-         return self.model_name
deepeval/models/mlllms/ollama_model.py +0 -144
@@ -1,144 +0,0 @@
- from typing import Optional, Tuple, List, Union, Dict
- from ollama import Client, AsyncClient, ChatResponse
- from pydantic import BaseModel
- import requests
- import base64
- import io
-
- from deepeval.models.retry_policy import (
-     create_retry_decorator,
- )
- from deepeval.key_handler import KEY_FILE_HANDLER, ModelKeyValues
- from deepeval.models import DeepEvalBaseMLLM
- from deepeval.test_case import MLLMImage
- from deepeval.config.settings import get_settings
- from deepeval.constants import ProviderSlug as PS
-
-
- retry_ollama = create_retry_decorator(PS.OLLAMA)
-
-
- class MultimodalOllamaModel(DeepEvalBaseMLLM):
-     def __init__(self, **kwargs):
-         model_name = KEY_FILE_HANDLER.fetch_data(
-             ModelKeyValues.LOCAL_MODEL_NAME
-         )
-         self.base_url = KEY_FILE_HANDLER.fetch_data(
-             ModelKeyValues.LOCAL_MODEL_BASE_URL
-         )
-         self.kwargs = kwargs
-         super().__init__(model_name)
-
-     @retry_ollama
-     def generate(
-         self,
-         multimodal_input: List[Union[str, MLLMImage]],
-         schema: Optional[BaseModel] = None,
-     ) -> Tuple[Union[str, Dict], float]:
-         chat_model = self.load_model()
-         messages = self.generate_messages(multimodal_input)
-         response: ChatResponse = chat_model.chat(
-             model=self.model_name,
-             messages=messages,
-             format=schema.model_json_schema() if schema else None,
-         )
-         return (
-             (
-                 schema.model_validate_json(response.message.content)
-                 if schema
-                 else response.message.content
-             ),
-             0,
-         )
-
-     @retry_ollama
-     async def a_generate(
-         self,
-         multimodal_input: List[Union[str, MLLMImage]],
-         schema: Optional[BaseModel] = None,
-     ) -> Tuple[str, float]:
-         chat_model = self.load_model(async_mode=True)
-         messages = self.generate_messages(multimodal_input)
-         response: ChatResponse = await chat_model.chat(
-             model=self.model_name,
-             messages=messages,
-             format=schema.model_json_schema() if schema else None,
-         )
-         return (
-             (
-                 schema.model_validate_json(response.message.content)
-                 if schema
-                 else response.message.content
-             ),
-             0,
-         )
-
-     def generate_messages(
-         self, multimodal_input: List[Union[str, MLLMImage]] = []
-     ):
-         messages = []
-         for ele in multimodal_input:
-             if isinstance(ele, str):
-                 messages.append(
-                     {
-                         "role": "user",
-                         "content": ele,
-                     }
-                 )
-             elif isinstance(ele, MLLMImage):
-                 img_b64 = self.convert_to_base64(ele.url, ele.local)
-                 if img_b64 is not None:
-                     messages.append(
-                         {
-                             "role": "user",
-                             "images": [img_b64],
-                         }
-                     )
-         return messages
-
-     ###############################################
-     # Utilities
-     ###############################################
-
-     def convert_to_base64(self, image_source: str, is_local: bool) -> str:
-         from PIL import Image
-
-         settings = get_settings()
-         try:
-             if not is_local:
-                 response = requests.get(
-                     image_source,
-                     stream=True,
-                     timeout=(
-                         settings.MEDIA_IMAGE_CONNECT_TIMEOUT_SECONDS,
-                         settings.MEDIA_IMAGE_READ_TIMEOUT_SECONDS,
-                     ),
-                 )
-                 response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
-                 image = Image.open(io.BytesIO(response.content))
-             else:
-                 image = Image.open(image_source)
-
-             buffered = io.BytesIO()
-             image.save(buffered, format="JPEG")
-             img_str = base64.b64encode(buffered.getvalue()).decode()
-             return img_str
-
-         except (requests.exceptions.RequestException, OSError) as e:
-             # Log, then rethrow so @retry_ollama can retry generate_messages() on network failures
-             print(f"Image fetch/encode failed: {e}")
-             raise
-         except Exception as e:
-             print(f"Error converting image to base64: {e}")
-             return None
-
-     def load_model(self, async_mode: bool = False):
-         if not async_mode:
-             return self._build_client(Client)
-         return self._build_client(AsyncClient)
-
-     def _build_client(self, cls):
-         return cls(host=self.base_url, **self.kwargs)
-
-     def get_model_name(self):
-         return f"{self.model_name} (Ollama)"
deepeval/models/mlllms/openai_model.py +0 -258
@@ -1,258 +0,0 @@
- from typing import Optional, Tuple, List, Union
- from openai import OpenAI, AsyncOpenAI
- from openai.types.chat import ParsedChatCompletion
- from pydantic import BaseModel
- from io import BytesIO
- import base64
-
- from deepeval.models.llms.openai_model import (
-     model_pricing,
-     structured_outputs_models,
- )
- from deepeval.models import DeepEvalBaseMLLM
- from deepeval.models.llms.utils import trim_and_load_json
- from deepeval.test_case import MLLMImage
- from deepeval.models.utils import parse_model_name
- from deepeval.models.retry_policy import (
-     create_retry_decorator,
-     sdk_retries_for,
- )
- from deepeval.constants import ProviderSlug as PS
-
-
- retry_openai = create_retry_decorator(PS.OPENAI)
-
- valid_multimodal_gpt_models = [
-     "gpt-4o",
-     "gpt-4o-2024-05-13",
-     "gpt-4o-2024-08-06",
-     "gpt-4o-2024-11-20",
-     "gpt-4o-mini",
-     "gpt-4o-mini-2024-07-18",
-     "gpt-4.1",
-     "gpt-4.1-mini",
-     "gpt-4.1-nano",
-     "o1",
-     "o1-preview",
-     "o1-2024-12-17",
-     "o1-preview-2024-09-12",
-     "gpt-4.5-preview-2025-02-27",
-     "o4-mini",
- ]
-
- default_multimodal_gpt_model = "gpt-4.1"
-
- unsupported_log_probs_multimodal_gpt_models = [
-     "o1",
-     "o1-preview",
-     "o1-2024-12-17",
-     "o1-preview-2024-09-12",
-     "gpt-4.5-preview-2025-02-27",
-     "o4-mini",
- ]
-
-
- class MultimodalOpenAIModel(DeepEvalBaseMLLM):
-     def __init__(
-         self,
-         model: Optional[str] = None,
-         _openai_api_key: Optional[str] = None,
-         *args,
-         **kwargs,
-     ):
-         model_name = None
-         if isinstance(model, str):
-             model_name = parse_model_name(model)
-             if model_name not in valid_multimodal_gpt_models:
-                 raise ValueError(
-                     f"Invalid model. Available Multimodal GPT models: {', '.join(model for model in valid_multimodal_gpt_models)}"
-                 )
-         elif model is None:
-             model_name = default_multimodal_gpt_model
-
-         self._openai_api_key = _openai_api_key
-         self.args = args
-         self.kwargs = kwargs
-
-         super().__init__(model_name, *args, **kwargs)
-
-     ###############################################
-     # Generate functions
-     ###############################################
-
-     @retry_openai
-     def generate(
-         self,
-         multimodal_input: List[Union[str, MLLMImage]],
-         schema: Optional[BaseModel] = None,
-     ) -> Tuple[str, float]:
-         client = OpenAI(api_key=self._openai_api_key)
-         prompt = self.generate_prompt(multimodal_input)
-
-         if schema:
-             if self.model_name in structured_outputs_models:
-                 messages = [{"role": "user", "content": prompt}]
-                 response = client.beta.chat.completions.parse(
-                     model=self.model_name,
-                     messages=messages,
-                     response_format=schema,
-                 )
-                 input_tokens = response.usage.prompt_tokens
-                 output_tokens = response.usage.completion_tokens
-                 total_cost = self.calculate_cost(input_tokens, output_tokens)
-                 generated_text = response.choices[0].message.parsed
-                 return generated_text, total_cost
-
-         completion = client.chat.completions.create(
-             model=self.model_name,
-             messages=[{"role": "user", "content": prompt}],
-         )
-         output = completion.choices[0].message.content
-         cost = self.calculate_cost(
-             completion.usage.prompt_tokens, completion.usage.completion_tokens
-         )
-         if schema:
-             json_output = trim_and_load_json(output)
-             return schema.model_validate(json_output), cost
-         else:
-             return output, cost
-
-     @retry_openai
-     async def a_generate(
-         self,
-         multimodal_input: List[Union[str, MLLMImage]],
-         schema: Optional[BaseModel] = None,
-     ) -> Tuple[str, float]:
-         client = AsyncOpenAI(api_key=self._openai_api_key)
-         prompt = self.generate_prompt(multimodal_input)
-
-         if schema:
-             if self.model_name in structured_outputs_models:
-                 messages = [{"role": "user", "content": prompt}]
-                 response = await client.beta.chat.completions.parse(
-                     model=self.model_name,
-                     messages=messages,
-                     response_format=schema,
-                 )
-                 input_tokens = response.usage.prompt_tokens
-                 output_tokens = response.usage.completion_tokens
-                 total_cost = self.calculate_cost(input_tokens, output_tokens)
-                 generated_text = response.choices[0].message.parsed
-                 return generated_text, total_cost
-
-         completion = await client.chat.completions.create(
-             model=self.model_name,
-             messages=[{"role": "user", "content": prompt}],
-         )
-         output = completion.choices[0].message.content
-         cost = self.calculate_cost(
-             completion.usage.prompt_tokens, completion.usage.completion_tokens
-         )
-         if schema:
-             json_output = trim_and_load_json(output)
-             return schema.model_validate(json_output), cost
-         else:
-             return output, cost
-
-     ###############################################
-     # Other generate functions
-     ###############################################
-
-     @retry_openai
-     def generate_raw_response(
-         self,
-         multimodal_input: List[Union[str, MLLMImage]],
-         top_logprobs: int = 5,
-     ) -> Tuple[ParsedChatCompletion, float]:
-         client = self._client()
-         prompt = self.generate_prompt(multimodal_input)
-         messages = [{"role": "user", "content": prompt}]
-         completion = client.chat.completions.create(
-             model=self.model_name,
-             messages=messages,
-             logprobs=True,
-             top_logprobs=top_logprobs,
-         )
-         # Cost calculation
-         input_tokens = completion.usage.prompt_tokens
-         output_tokens = completion.usage.completion_tokens
-         cost = self.calculate_cost(input_tokens, output_tokens)
-         return completion, cost
-
-     @retry_openai
-     async def a_generate_raw_response(
-         self,
-         multimodal_input: List[Union[str, MLLMImage]],
-         top_logprobs: int = 5,
-     ) -> Tuple[ParsedChatCompletion, float]:
-         client = self._client(async_mode=True)
-         prompt = self.generate_prompt(multimodal_input)
-         messages = [{"role": "user", "content": prompt}]
-         completion = await client.chat.completions.create(
-             model=self.model_name,
-             messages=messages,
-             logprobs=True,
-             top_logprobs=top_logprobs,
-         )
-         # Cost calculation
-         input_tokens = completion.usage.prompt_tokens
-         output_tokens = completion.usage.completion_tokens
-         cost = self.calculate_cost(input_tokens, output_tokens)
-         return completion, cost
-
-     ###############################################
-     # Utilities
-     ###############################################
-
-     def generate_prompt(
-         self, multimodal_input: List[Union[str, MLLMImage]] = []
-     ):
-         prompt = []
-         for ele in multimodal_input:
-             if isinstance(ele, str):
-                 prompt.append({"type": "text", "text": ele})
-             elif isinstance(ele, MLLMImage):
-                 if ele.local:
-                     import PIL.Image
-
-                     image = PIL.Image.open(ele.url)
-                     visual_dict = {
-                         "type": "image_url",
-                         "image_url": {
-                             "url": f"data:image/jpeg;base64,{self.encode_pil_image(image)}"
-                         },
-                     }
-                 else:
-                     visual_dict = {
-                         "type": "image_url",
-                         "image_url": {"url": ele.url},
-                     }
-                 prompt.append(visual_dict)
-         return prompt
-
-     def calculate_cost(self, input_tokens: int, output_tokens: int) -> float:
-         pricing = model_pricing.get(
-             self.model_name, model_pricing["gpt-4.1"]
-         )  # Default to 'gpt-4.1' if model not found
-         input_cost = input_tokens * pricing["input"]
-         output_cost = output_tokens * pricing["output"]
-         return input_cost + output_cost
-
-     def encode_pil_image(self, pil_image):
-         image_buffer = BytesIO()
-         if pil_image.mode in ("RGBA", "LA", "P"):
-             pil_image = pil_image.convert("RGB")
-         pil_image.save(image_buffer, format="JPEG")
-         image_bytes = image_buffer.getvalue()
-         base64_encoded_image = base64.b64encode(image_bytes).decode("utf-8")
-         return base64_encoded_image
-
-     def _client(self, async_mode: bool = False):
-         kw = {"api_key": self._openai_api_key}
-         if not sdk_retries_for(PS.OPENAI):
-             kw["max_retries"] = 0
-         Client = AsyncOpenAI if async_mode else OpenAI
-         return Client(**kw)
-
-     def get_model_name(self):
-         return self.model_name