deepeval 3.7.5__py3-none-any.whl → 3.7.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150)
  1. deepeval/_version.py +1 -1
  2. deepeval/cli/main.py +2022 -759
  3. deepeval/cli/utils.py +208 -36
  4. deepeval/config/dotenv_handler.py +19 -0
  5. deepeval/config/settings.py +675 -245
  6. deepeval/config/utils.py +9 -1
  7. deepeval/dataset/api.py +23 -1
  8. deepeval/dataset/golden.py +106 -21
  9. deepeval/evaluate/evaluate.py +0 -3
  10. deepeval/evaluate/execute.py +162 -315
  11. deepeval/evaluate/utils.py +6 -30
  12. deepeval/key_handler.py +124 -51
  13. deepeval/metrics/__init__.py +0 -4
  14. deepeval/metrics/answer_relevancy/answer_relevancy.py +89 -132
  15. deepeval/metrics/answer_relevancy/template.py +102 -179
  16. deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
  17. deepeval/metrics/arena_g_eval/template.py +17 -1
  18. deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
  19. deepeval/metrics/argument_correctness/template.py +19 -2
  20. deepeval/metrics/base_metric.py +19 -41
  21. deepeval/metrics/bias/bias.py +102 -108
  22. deepeval/metrics/bias/template.py +14 -2
  23. deepeval/metrics/contextual_precision/contextual_precision.py +56 -92
  24. deepeval/metrics/contextual_recall/contextual_recall.py +58 -85
  25. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +53 -83
  26. deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
  27. deepeval/metrics/conversation_completeness/template.py +23 -3
  28. deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
  29. deepeval/metrics/conversational_dag/nodes.py +66 -123
  30. deepeval/metrics/conversational_dag/templates.py +16 -0
  31. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
  32. deepeval/metrics/dag/dag.py +10 -0
  33. deepeval/metrics/dag/nodes.py +63 -126
  34. deepeval/metrics/dag/templates.py +14 -0
  35. deepeval/metrics/exact_match/exact_match.py +9 -1
  36. deepeval/metrics/faithfulness/faithfulness.py +82 -136
  37. deepeval/metrics/g_eval/g_eval.py +93 -79
  38. deepeval/metrics/g_eval/template.py +18 -1
  39. deepeval/metrics/g_eval/utils.py +7 -6
  40. deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
  41. deepeval/metrics/goal_accuracy/template.py +21 -3
  42. deepeval/metrics/hallucination/hallucination.py +60 -75
  43. deepeval/metrics/hallucination/template.py +13 -0
  44. deepeval/metrics/indicator.py +11 -10
  45. deepeval/metrics/json_correctness/json_correctness.py +40 -38
  46. deepeval/metrics/json_correctness/template.py +10 -0
  47. deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
  48. deepeval/metrics/knowledge_retention/schema.py +9 -3
  49. deepeval/metrics/knowledge_retention/template.py +12 -0
  50. deepeval/metrics/mcp/mcp_task_completion.py +72 -43
  51. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +93 -75
  52. deepeval/metrics/mcp/schema.py +4 -0
  53. deepeval/metrics/mcp/template.py +59 -0
  54. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
  55. deepeval/metrics/mcp_use_metric/template.py +12 -0
  56. deepeval/metrics/misuse/misuse.py +77 -97
  57. deepeval/metrics/misuse/template.py +15 -0
  58. deepeval/metrics/multimodal_metrics/__init__.py +0 -1
  59. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +37 -38
  60. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +55 -76
  61. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +37 -38
  62. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +37 -38
  63. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +57 -76
  64. deepeval/metrics/non_advice/non_advice.py +79 -105
  65. deepeval/metrics/non_advice/template.py +12 -0
  66. deepeval/metrics/pattern_match/pattern_match.py +12 -4
  67. deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
  68. deepeval/metrics/pii_leakage/template.py +14 -0
  69. deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
  70. deepeval/metrics/plan_adherence/template.py +11 -0
  71. deepeval/metrics/plan_quality/plan_quality.py +63 -87
  72. deepeval/metrics/plan_quality/template.py +9 -0
  73. deepeval/metrics/prompt_alignment/prompt_alignment.py +78 -86
  74. deepeval/metrics/prompt_alignment/template.py +12 -0
  75. deepeval/metrics/role_adherence/role_adherence.py +48 -71
  76. deepeval/metrics/role_adherence/template.py +14 -0
  77. deepeval/metrics/role_violation/role_violation.py +75 -108
  78. deepeval/metrics/role_violation/template.py +12 -0
  79. deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
  80. deepeval/metrics/step_efficiency/template.py +11 -0
  81. deepeval/metrics/summarization/summarization.py +115 -183
  82. deepeval/metrics/summarization/template.py +19 -0
  83. deepeval/metrics/task_completion/task_completion.py +67 -73
  84. deepeval/metrics/tool_correctness/tool_correctness.py +43 -42
  85. deepeval/metrics/tool_use/schema.py +4 -0
  86. deepeval/metrics/tool_use/template.py +16 -2
  87. deepeval/metrics/tool_use/tool_use.py +72 -94
  88. deepeval/metrics/topic_adherence/schema.py +4 -0
  89. deepeval/metrics/topic_adherence/template.py +21 -1
  90. deepeval/metrics/topic_adherence/topic_adherence.py +68 -81
  91. deepeval/metrics/toxicity/template.py +13 -0
  92. deepeval/metrics/toxicity/toxicity.py +80 -99
  93. deepeval/metrics/turn_contextual_precision/schema.py +3 -3
  94. deepeval/metrics/turn_contextual_precision/template.py +9 -2
  95. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +154 -154
  96. deepeval/metrics/turn_contextual_recall/schema.py +3 -3
  97. deepeval/metrics/turn_contextual_recall/template.py +8 -1
  98. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +148 -143
  99. deepeval/metrics/turn_contextual_relevancy/schema.py +2 -2
  100. deepeval/metrics/turn_contextual_relevancy/template.py +8 -1
  101. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +154 -157
  102. deepeval/metrics/turn_faithfulness/schema.py +1 -1
  103. deepeval/metrics/turn_faithfulness/template.py +8 -1
  104. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +180 -203
  105. deepeval/metrics/turn_relevancy/template.py +14 -0
  106. deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
  107. deepeval/metrics/utils.py +161 -91
  108. deepeval/models/__init__.py +2 -0
  109. deepeval/models/base_model.py +44 -6
  110. deepeval/models/embedding_models/azure_embedding_model.py +34 -12
  111. deepeval/models/embedding_models/local_embedding_model.py +22 -7
  112. deepeval/models/embedding_models/ollama_embedding_model.py +17 -6
  113. deepeval/models/embedding_models/openai_embedding_model.py +3 -2
  114. deepeval/models/llms/__init__.py +2 -0
  115. deepeval/models/llms/amazon_bedrock_model.py +229 -73
  116. deepeval/models/llms/anthropic_model.py +143 -48
  117. deepeval/models/llms/azure_model.py +169 -95
  118. deepeval/models/llms/constants.py +2032 -0
  119. deepeval/models/llms/deepseek_model.py +82 -35
  120. deepeval/models/llms/gemini_model.py +126 -67
  121. deepeval/models/llms/grok_model.py +128 -65
  122. deepeval/models/llms/kimi_model.py +129 -87
  123. deepeval/models/llms/litellm_model.py +94 -18
  124. deepeval/models/llms/local_model.py +115 -16
  125. deepeval/models/llms/ollama_model.py +97 -76
  126. deepeval/models/llms/openai_model.py +169 -311
  127. deepeval/models/llms/portkey_model.py +58 -16
  128. deepeval/models/llms/utils.py +5 -2
  129. deepeval/models/retry_policy.py +10 -5
  130. deepeval/models/utils.py +56 -4
  131. deepeval/simulator/conversation_simulator.py +49 -2
  132. deepeval/simulator/template.py +16 -1
  133. deepeval/synthesizer/synthesizer.py +19 -17
  134. deepeval/test_case/api.py +24 -45
  135. deepeval/test_case/arena_test_case.py +7 -2
  136. deepeval/test_case/conversational_test_case.py +55 -6
  137. deepeval/test_case/llm_test_case.py +60 -6
  138. deepeval/test_run/api.py +3 -0
  139. deepeval/test_run/test_run.py +6 -1
  140. deepeval/utils.py +26 -0
  141. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/METADATA +3 -3
  142. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/RECORD +145 -148
  143. deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
  144. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
  145. deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
  146. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -133
  147. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
  148. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/LICENSE.md +0 -0
  149. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/WHEEL +0 -0
  150. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/entry_points.txt +0 -0
--- a/deepeval/models/llms/litellm_model.py
+++ b/deepeval/models/llms/litellm_model.py
@@ -9,13 +9,17 @@ from tenacity import (
     RetryCallState,
 )
 
+from deepeval.errors import DeepEvalError
 from deepeval.config.settings import get_settings
 from deepeval.models.utils import (
     require_secret_api_key,
     normalize_kwargs_and_extract_aliases,
 )
+from deepeval.test_case import MLLMImage
+from deepeval.utils import check_if_multimodal, convert_to_multi_modal_array
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.models.llms.utils import trim_and_load_json
+from deepeval.utils import require_param
 
 
 def log_retry_error(retry_state: RetryCallState):
@@ -47,11 +51,11 @@ class LiteLLMModel(DeepEvalBaseLLM):
         model: Optional[str] = None,
         api_key: Optional[str] = None,
         base_url: Optional[str] = None,
-        temperature: float = 0,
+        temperature: Optional[float] = None,
         generation_kwargs: Optional[Dict] = None,
         **kwargs,
     ):
-
+        settings = get_settings()
         normalized_kwargs, alias_values = normalize_kwargs_and_extract_aliases(
             "LiteLLMModel",
             kwargs,
@@ -62,18 +66,13 @@ class LiteLLMModel(DeepEvalBaseLLM):
         if base_url is None and "base_url" in alias_values:
             base_url = alias_values["base_url"]
 
-        settings = get_settings()
         # Get model name from parameter or key file
         model = model or settings.LITELLM_MODEL_NAME
-        if not model:
-            raise ValueError(
-                "Model name must be provided either through parameter or set-litellm command"
-            )
 
         # Get API key from parameter, or settings
         if api_key is not None:
             # keep it secret, keep it safe from serializings, logging and aolike
-            self.api_key: SecretStr | None = SecretStr(api_key)
+            self.api_key: Optional[SecretStr] = SecretStr(api_key)
         else:
             self.api_key = (
                 settings.LITELLM_API_KEY
@@ -84,7 +83,7 @@ class LiteLLMModel(DeepEvalBaseLLM):
         )
 
         # Get API base from parameter, key file, or environment variable
-        self.base_url = (
+        base_url = (
             base_url
             or (
                 str(settings.LITELLM_API_BASE)
@@ -97,13 +96,35 @@ class LiteLLMModel(DeepEvalBaseLLM):
                 else None
             )
         )
+        self.base_url = (
+            str(base_url).rstrip("/") if base_url is not None else None
+        )
+
+        if temperature is not None:
+            temperature = float(temperature)
+        elif settings.TEMPERATURE is not None:
+            temperature = settings.TEMPERATURE
+        else:
+            temperature = 0.0
+
+        # validation
+        model = require_param(
+            model,
+            provider_label="LiteLLMModel",
+            env_var_name="LITELLM_MODEL_NAME",
+            param_hint="model",
+        )
 
         if temperature < 0:
-            raise ValueError("Temperature must be >= 0.")
+            raise DeepEvalError("Temperature must be >= 0.")
         self.temperature = temperature
         # Keep sanitized kwargs for client call to strip legacy keys
         self.kwargs = normalized_kwargs
-        self.generation_kwargs = generation_kwargs or {}
+        self.kwargs.pop("temperature", None)
+
+        self.generation_kwargs = dict(generation_kwargs or {})
+        self.generation_kwargs.pop("temperature", None)
+
         self.evaluation_cost = 0.0 # Initialize cost to 0.0
         super().__init__(model)
 
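Worth noting in the constructor above: temperature is now resolved through an explicit precedence chain (constructor argument, then the TEMPERATURE setting, then 0.0), and model-name validation moves into require_param. A minimal sketch of the precedence logic in isolation (the helper below is illustrative, not part of deepeval):

    from typing import Optional

    def resolve_temperature(
        arg: Optional[float], settings_temperature: Optional[float]
    ) -> float:
        # Mirrors the diff: explicit argument first, then
        # settings.TEMPERATURE, then a default of 0.0.
        if arg is not None:
            return float(arg)
        if settings_temperature is not None:
            return settings_temperature
        return 0.0

    assert resolve_temperature(None, None) == 0.0
    assert resolve_temperature(None, 0.7) == 0.7
    assert resolve_temperature(0.2, 0.7) == 0.2

The same chain recurs in the LocalModel and OllamaModel constructors further down, so all three providers pick up settings.TEMPERATURE consistently.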
@@ -117,13 +138,19 @@ class LiteLLMModel(DeepEvalBaseLLM):
     )
     def generate(
         self, prompt: str, schema: Optional[BaseModel] = None
-    ) -> Union[str, Dict, Tuple[str, float]]:
+    ) -> Tuple[Union[str, BaseModel], float]:
 
         from litellm import completion
 
+        if check_if_multimodal(prompt):
+            prompt = convert_to_multi_modal_array(input=prompt)
+            content = self.generate_content(prompt)
+        else:
+            content = [{"type": "text", "text": prompt}]
+
         completion_params = {
             "model": self.name,
-            "messages": [{"role": "user", "content": prompt}],
+            "messages": [{"role": "user", "content": content}],
             "temperature": self.temperature,
         }
 
@@ -131,7 +158,7 @@ class LiteLLMModel(DeepEvalBaseLLM):
         api_key = require_secret_api_key(
             self.api_key,
             provider_label="LiteLLM",
-            env_var_name="LITELLM_API_KEY|OPENAI_API_KEY|ANTHROPIC_API_KEY|GOOGLE_API_KEY",
+            env_var_name="LITELLM_API_KEY|LITELLM_PROXY_API_KEY|OPENAI_API_KEY|ANTHROPIC_API_KEY|GOOGLE_API_KEY",
             param_hint="`api_key` to LiteLLMModel(...)",
         )
         completion_params["api_key"] = api_key
@@ -173,13 +200,19 @@ class LiteLLMModel(DeepEvalBaseLLM):
     )
     async def a_generate(
         self, prompt: str, schema: Optional[BaseModel] = None
-    ) -> Union[str, Dict, Tuple[str, float]]:
+    ) -> Tuple[Union[str, BaseModel], float]:
 
         from litellm import acompletion
 
+        if check_if_multimodal(prompt):
+            prompt = convert_to_multi_modal_array(input=prompt)
+            content = self.generate_content(prompt)
+        else:
+            content = [{"type": "text", "text": prompt}]
+
         completion_params = {
             "model": self.name,
-            "messages": [{"role": "user", "content": prompt}],
+            "messages": [{"role": "user", "content": content}],
             "temperature": self.temperature,
         }
 
@@ -241,9 +274,14 @@ class LiteLLMModel(DeepEvalBaseLLM):
             env_var_name="LITELLM_API_KEY|OPENAI_API_KEY|ANTHROPIC_API_KEY|GOOGLE_API_KEY",
             param_hint="`api_key` to LiteLLMModel(...)",
         )
+        if check_if_multimodal(prompt):
+            prompt = convert_to_multi_modal_array(input=prompt)
+            content = self.generate_content(prompt)
+        else:
+            content = [{"type": "text", "text": prompt}]
         completion_params = {
             "model": self.name,
-            "messages": [{"role": "user", "content": prompt}],
+            "messages": [{"role": "user", "content": content}],
             "temperature": self.temperature,
             "api_key": api_key,
             "api_base": self.base_url,
@@ -251,6 +289,7 @@ class LiteLLMModel(DeepEvalBaseLLM):
             "top_logprobs": top_logprobs,
         }
         completion_params.update(self.kwargs)
+        completion_params.update(self.generation_kwargs)
 
         response = completion(**completion_params)
         cost = self.calculate_cost(response)
@@ -282,9 +321,14 @@ class LiteLLMModel(DeepEvalBaseLLM):
             env_var_name="LITELLM_API_KEY|OPENAI_API_KEY|ANTHROPIC_API_KEY|GOOGLE_API_KEY",
             param_hint="`api_key` to LiteLLMModel(...)",
         )
+        if check_if_multimodal(prompt):
+            prompt = convert_to_multi_modal_array(input=prompt)
+            content = self.generate_content(prompt)
+        else:
+            content = [{"type": "text", "text": prompt}]
         completion_params = {
             "model": self.name,
-            "messages": [{"role": "user", "content": prompt}],
+            "messages": [{"role": "user", "content": content}],
             "temperature": self.temperature,
             "api_key": api_key,
             "api_base": self.base_url,
@@ -292,6 +336,7 @@ class LiteLLMModel(DeepEvalBaseLLM):
             "top_logprobs": top_logprobs,
         }
         completion_params.update(self.kwargs)
+        completion_params.update(self.generation_kwargs)
 
         response = await acompletion(**completion_params)
         cost = self.calculate_cost(response)
@@ -340,6 +385,34 @@ class LiteLLMModel(DeepEvalBaseLLM):
             logging.error(f"Error in LiteLLM generate_samples: {e}")
             raise
 
+    def generate_content(
+        self, multimodal_input: List[Union[str, MLLMImage]] = []
+    ):
+        content = []
+        for element in multimodal_input:
+            if isinstance(element, str):
+                content.append({"type": "text", "text": element})
+            elif isinstance(element, MLLMImage):
+                if element.url and not element.local:
+                    content.append(
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": element.url},
+                        }
+                    )
+                else:
+                    element.ensure_images_loaded()
+                    data_uri = (
+                        f"data:{element.mimeType};base64,{element.dataBase64}"
+                    )
+                    content.append(
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": data_uri},
+                        }
+                    )
+        return content
+
     def calculate_cost(self, response: Any) -> float:
         """Calculate the cost of the response based on token usage."""
         try:
@@ -389,3 +462,6 @@ class LiteLLMModel(DeepEvalBaseLLM):
             None as LiteLLM handles client creation internally
         """
         return None
+
+    def supports_multimodal(self):
+        return True
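With generate_content in place, LiteLLMModel forwards mixed text-and-image prompts as OpenAI-style content arrays: remote images pass through as URLs, local ones are inlined as base64 data URIs. A hedged usage sketch, assuming a vision-capable model and credentials already configured; the model name and image URL are placeholders:

    from deepeval.models.llms.litellm_model import LiteLLMModel
    from deepeval.test_case import MLLMImage

    model = LiteLLMModel(model="openai/gpt-4o")  # placeholder model name

    # Build the message content the same way generate() does internally.
    content = model.generate_content(
        [
            "Describe this image in one sentence.",
            MLLMImage(url="https://example.com/cat.png"),  # placeholder URL
        ]
    )
    # content is a list of {"type": "text" | "image_url", ...} parts that
    # litellm.completion accepts as a single user message.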
--- a/deepeval/models/llms/local_model.py
+++ b/deepeval/models/llms/local_model.py
@@ -1,8 +1,9 @@
-from typing import Optional, Tuple, Union, Dict
+from typing import Optional, Tuple, Union, Dict, List
 from pydantic import BaseModel, SecretStr
 from openai import OpenAI, AsyncOpenAI
 from openai.types.chat import ChatCompletion
 
+from deepeval.errors import DeepEvalError
 from deepeval.config.settings import get_settings
 from deepeval.models.retry_policy import (
     create_retry_decorator,
@@ -14,6 +15,12 @@ from deepeval.models.utils import (
 )
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.constants import ProviderSlug as PS
+from deepeval.test_case import MLLMImage
+from deepeval.utils import (
+    check_if_multimodal,
+    convert_to_multi_modal_array,
+    require_param,
+)
 
 
 # consistent retry rules
@@ -26,7 +33,7 @@ class LocalModel(DeepEvalBaseLLM):
         model: Optional[str] = None,
         api_key: Optional[str] = None,
         base_url: Optional[str] = None,
-        temperature: float = 0,
+        temperature: Optional[float] = None,
         format: Optional[str] = None,
         generation_kwargs: Optional[Dict] = None,
         **kwargs,
@@ -35,38 +42,64 @@ class LocalModel(DeepEvalBaseLLM):
 
         model = model or settings.LOCAL_MODEL_NAME
         if api_key is not None:
-            # keep it secret, keep it safe from serializings, logging and alike
-            self.local_model_api_key: SecretStr | None = SecretStr(api_key)
+            self.local_model_api_key: Optional[SecretStr] = SecretStr(api_key)
         else:
             self.local_model_api_key = settings.LOCAL_MODEL_API_KEY
 
+        base_url = (
+            base_url if base_url is not None else settings.LOCAL_MODEL_BASE_URL
+        )
         self.base_url = (
-            base_url
-            or settings.LOCAL_MODEL_BASE_URL
-            and str(settings.LOCAL_MODEL_BASE_URL)
+            str(base_url).rstrip("/") if base_url is not None else None
+        )
+        self.format = format or settings.LOCAL_MODEL_FORMAT or "json"
+
+        if temperature is not None:
+            temperature = float(temperature)
+        elif settings.TEMPERATURE is not None:
+            temperature = settings.TEMPERATURE
+        else:
+            temperature = 0.0
+
+        # validation
+        model = require_param(
+            model,
+            provider_label="LocalModel",
+            env_var_name="LOCAL_MODEL_NAME",
+            param_hint="model",
         )
-        self.format = format or settings.LOCAL_MODEL_FORMAT
+
         if temperature < 0:
-            raise ValueError("Temperature must be >= 0.")
+            raise DeepEvalError("Temperature must be >= 0.")
         self.temperature = temperature
-        # Keep sanitized kwargs for client call to strip legacy keys
+
         self.kwargs = kwargs
-        self.generation_kwargs = generation_kwargs or {}
+        self.kwargs.pop("temperature", None)
+
+        self.generation_kwargs = dict(generation_kwargs or {})
+        self.generation_kwargs.pop("temperature", None)
+
         super().__init__(model)
 
     ###############################################
-    # Other generate functions
+    # Generate functions
     ###############################################
 
     @retry_local
     def generate(
         self, prompt: str, schema: Optional[BaseModel] = None
-    ) -> Tuple[Union[str, Dict], float]:
+    ) -> Tuple[Union[str, BaseModel], float]:
+
+        if check_if_multimodal(prompt):
+            prompt = convert_to_multi_modal_array(input=prompt)
+            content = self.generate_content(prompt)
+        else:
+            content = prompt
 
         client = self.load_model(async_mode=False)
         response: ChatCompletion = client.chat.completions.create(
             model=self.name,
-            messages=[{"role": "user", "content": prompt}],
+            messages=[{"role": "user", "content": content}],
             temperature=self.temperature,
             **self.generation_kwargs,
         )
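A detail shared by these rewritten constructors: both self.kwargs and self.generation_kwargs are stripped of any stray "temperature" key, so the value resolved earlier stays authoritative. A minimal sketch of that sanitization step (the helper name is illustrative):

    from typing import Dict, Optional

    def sanitize_generation_kwargs(generation_kwargs: Optional[Dict]) -> Dict:
        # Copy first so the caller's dict is never mutated, then drop
        # "temperature": the resolved self.temperature wins.
        sanitized = dict(generation_kwargs or {})
        sanitized.pop("temperature", None)
        return sanitized

    assert sanitize_generation_kwargs({"temperature": 0.9, "top_p": 0.5}) == {
        "top_p": 0.5
    }
    assert sanitize_generation_kwargs(None) == {}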
@@ -81,12 +114,18 @@ class LocalModel(DeepEvalBaseLLM):
     @retry_local
     async def a_generate(
         self, prompt: str, schema: Optional[BaseModel] = None
-    ) -> Tuple[Union[str, Dict], float]:
+    ) -> Tuple[Union[str, BaseModel], float]:
+
+        if check_if_multimodal(prompt):
+            prompt = convert_to_multi_modal_array(input=prompt)
+            content = self.generate_content(prompt)
+        else:
+            content = prompt
 
         client = self.load_model(async_mode=True)
         response: ChatCompletion = await client.chat.completions.create(
             model=self.name,
-            messages=[{"role": "user", "content": prompt}],
+            messages=[{"role": "user", "content": content}],
             temperature=self.temperature,
             **self.generation_kwargs,
         )
@@ -98,6 +137,63 @@ class LocalModel(DeepEvalBaseLLM):
         else:
             return res_content, 0.0
 
+    def generate_content(
+        self, multimodal_input: List[Union[str, MLLMImage]] = []
+    ):
+        """
+        Converts multimodal input into OpenAI-compatible format.
+        Uses data URIs for all images since we can't guarantee local servers support URL fetching.
+        """
+        prompt = []
+        for element in multimodal_input:
+            if isinstance(element, str):
+                prompt.append({"type": "text", "text": element})
+            elif isinstance(element, MLLMImage):
+                # For local servers, use data URIs for both remote and local images
+                # Most local servers don't support fetching external URLs
+                if element.url and not element.local:
+                    import requests
+                    import base64
+
+                    settings = get_settings()
+                    try:
+                        response = requests.get(
+                            element.url,
+                            timeout=(
+                                settings.MEDIA_IMAGE_CONNECT_TIMEOUT_SECONDS,
+                                settings.MEDIA_IMAGE_READ_TIMEOUT_SECONDS,
+                            ),
+                        )
+                        response.raise_for_status()
+
+                        # Get mime type from response
+                        mime_type = response.headers.get(
+                            "content-type", element.mimeType or "image/jpeg"
+                        )
+
+                        # Encode to base64
+                        b64_data = base64.b64encode(response.content).decode(
+                            "utf-8"
+                        )
+                        data_uri = f"data:{mime_type};base64,{b64_data}"
+
+                    except Exception as e:
+                        raise ValueError(
+                            f"Failed to fetch remote image {element.url}: {e}"
+                        )
+                else:
+                    element.ensure_images_loaded()
+                    mime_type = element.mimeType or "image/jpeg"
+                    data_uri = f"data:{mime_type};base64,{element.dataBase64}"
+
+                prompt.append(
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": data_uri},
+                    }
+                )
+        return prompt
+
     ###############################################
     # Model
     ###############################################
@@ -105,6 +201,9 @@ class LocalModel(DeepEvalBaseLLM):
     def get_model_name(self):
         return f"{self.name} (Local Model)"
 
+    def supports_multimodal(self):
+        return True
+
     def load_model(self, async_mode: bool = False):
         if not async_mode:
             return self._build_client(OpenAI)
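LocalModel.generate_content inlines every image as a base64 data URI because an OpenAI-compatible local server cannot be assumed to fetch external URLs itself. A standalone sketch of that encoding step; the helper name and the fixed timeout are illustrative (the real code reads its timeouts from settings):

    import base64

    import requests

    def image_url_to_data_uri(url: str, timeout=(5.0, 30.0)) -> str:
        # Download the image, then embed it so the local server never
        # needs outbound network access.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        mime_type = response.headers.get("content-type", "image/jpeg")
        b64_data = base64.b64encode(response.content).decode("utf-8")
        return f"data:{mime_type};base64,{b64_data}"

The trade-off is request size for reliability: base64 grows the payload by roughly a third, but the result works against any server that accepts OpenAI-style image_url parts.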
--- a/deepeval/models/llms/ollama_model.py
+++ b/deepeval/models/llms/ollama_model.py
@@ -1,11 +1,10 @@
 from typing import TYPE_CHECKING, Optional, Tuple, Union, Dict, List
 from pydantic import BaseModel
-import requests
 import base64
-import io
 
+from deepeval.errors import DeepEvalError
 from deepeval.config.settings import get_settings
-from deepeval.utils import require_dependency
+from deepeval.utils import require_dependency, require_param
 from deepeval.models.retry_policy import (
     create_retry_decorator,
 )
@@ -13,17 +12,7 @@ from deepeval.utils import convert_to_multi_modal_array, check_if_multimodal
 from deepeval.test_case import MLLMImage
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.constants import ProviderSlug as PS
-
-valid_multimodal_models = [
-    "llava:7b",
-    "llava:13b",
-    "llava:34b",
-    "llama4",
-    "gemma3",
-    "qwen3-vl",
-    "qwen2.5-vl",
-    # TODO: Add more models later on by looking at their catelogue
-]
+from deepeval.models.llms.constants import OLLAMA_MODELS_DATA
 
 if TYPE_CHECKING:
     from ollama import ChatResponse
@@ -36,26 +25,46 @@ class OllamaModel(DeepEvalBaseLLM):
         self,
         model: Optional[str] = None,
         base_url: Optional[str] = None,
-        temperature: float = 0,
+        temperature: Optional[float] = None,
         generation_kwargs: Optional[Dict] = None,
         **kwargs,
     ):
         settings = get_settings()
-        model = model or settings.LOCAL_MODEL_NAME
-        self.base_url = (
-            base_url
-            or (
-                settings.LOCAL_MODEL_BASE_URL
-                and str(settings.LOCAL_MODEL_BASE_URL)
-            )
-            or "http://localhost:11434"
+        model = model or settings.OLLAMA_MODEL_NAME
+        self.model_data = OLLAMA_MODELS_DATA.get(model)
+
+        if base_url is not None:
+            self.base_url = str(base_url).rstrip("/")
+        elif settings.LOCAL_MODEL_BASE_URL is not None:
+            self.base_url = str(settings.LOCAL_MODEL_BASE_URL).rstrip("/")
+        else:
+            self.base_url = "http://localhost:11434"
+
+        if temperature is not None:
+            temperature = float(temperature)
+        elif settings.TEMPERATURE is not None:
+            temperature = settings.TEMPERATURE
+        else:
+            temperature = 0.0
+
+        # validation
+        model = require_param(
+            model,
+            provider_label="OllamaModel",
+            env_var_name="LOCAL_MODEL_NAME",
+            param_hint="model",
         )
+
         if temperature < 0:
-            raise ValueError("Temperature must be >= 0.")
+            raise DeepEvalError("Temperature must be >= 0.")
         self.temperature = temperature
         # Keep sanitized kwargs for client call to strip legacy keys
         self.kwargs = kwargs
-        self.generation_kwargs = generation_kwargs or {}
+        self.kwargs.pop("temperature", None)
+
+        self.generation_kwargs = dict(generation_kwargs or {})
+        self.generation_kwargs.pop("temperature", None)
+
         super().__init__(model)
 
     ###############################################
@@ -65,7 +74,7 @@ class OllamaModel(DeepEvalBaseLLM):
     @retry_ollama
     def generate(
         self, prompt: str, schema: Optional[BaseModel] = None
-    ) -> Tuple[Union[str, Dict], float]:
+    ) -> Tuple[Union[str, BaseModel], float]:
         chat_model = self.load_model()
 
         if check_if_multimodal(prompt):
@@ -73,7 +82,6 @@ class OllamaModel(DeepEvalBaseLLM):
             messages = self.generate_messages(prompt)
         else:
             messages = [{"role": "user", "content": prompt}]
-        print(messages)
 
         response: ChatResponse = chat_model.chat(
             model=self.name,
@@ -96,7 +104,7 @@ class OllamaModel(DeepEvalBaseLLM):
     @retry_ollama
     async def a_generate(
         self, prompt: str, schema: Optional[BaseModel] = None
-    ) -> Tuple[str, float]:
+    ) -> Tuple[Union[str, BaseModel], float]:
         chat_model = self.load_model(async_mode=True)
 
         if check_if_multimodal(prompt):
@@ -127,60 +135,78 @@ class OllamaModel(DeepEvalBaseLLM):
         self, multimodal_input: List[Union[str, MLLMImage]] = []
     ):
         messages = []
-        for ele in multimodal_input:
-            if isinstance(ele, str):
+
+        for element in multimodal_input:
+            if isinstance(element, str):
+                messages.append(
+                    {
+                        "role": "user",
+                        "content": element,
+                    }
+                )
+            elif isinstance(element, MLLMImage):
+                if element.url and not element.local:
+                    import requests
+                    from PIL import Image
+                    import io
+
+                    settings = get_settings()
+                    try:
+                        response = requests.get(
+                            element.url,
+                            stream=True,
+                            timeout=(
+                                settings.MEDIA_IMAGE_CONNECT_TIMEOUT_SECONDS,
+                                settings.MEDIA_IMAGE_READ_TIMEOUT_SECONDS,
+                            ),
+                        )
+                        response.raise_for_status()
+
+                        # Convert to JPEG and encode
+                        image = Image.open(io.BytesIO(response.content))
+                        buffered = io.BytesIO()
+
+                        # Convert RGBA/LA/P to RGB for JPEG
+                        if image.mode in ("RGBA", "LA", "P"):
+                            image = image.convert("RGB")
+
+                        image.save(buffered, format="JPEG")
+                        img_b64 = base64.b64encode(buffered.getvalue()).decode()
+
+                    except (requests.exceptions.RequestException, OSError) as e:
+                        print(f"Image fetch/encode failed: {e}")
+                        raise
+                else:
+                    element.ensure_images_loaded()
+                    img_b64 = element.dataBase64
+
                 messages.append(
                     {
                         "role": "user",
-                        "content": ele,
+                        "images": [img_b64],
                     }
                 )
-            elif isinstance(ele, MLLMImage):
-                img_b64 = self.convert_to_base64(ele.url, ele.local)
-                if img_b64 is not None:
-                    messages.append(
-                        {
-                            "role": "user",
-                            "images": [img_b64],
-                        }
-                    )
+
         return messages
 
     ###############################################
-    # Utilities
+    # Capabilities
    ###############################################
 
-    def convert_to_base64(self, image_source: str, is_local: bool) -> str:
-        from PIL import Image
+    def supports_log_probs(self) -> Union[bool, None]:
+        return self.model_data.supports_log_probs
 
-        settings = get_settings()
-        try:
-            if not is_local:
-                response = requests.get(
-                    image_source,
-                    stream=True,
-                    timeout=(
-                        settings.MEDIA_IMAGE_CONNECT_TIMEOUT_SECONDS,
-                        settings.MEDIA_IMAGE_READ_TIMEOUT_SECONDS,
-                    ),
-                )
-                response.raise_for_status() # Raise HTTPError for bad responses (4xx or 5xx)
-                image = Image.open(io.BytesIO(response.content))
-            else:
-                image = Image.open(image_source)
-
-            buffered = io.BytesIO()
-            image.save(buffered, format="JPEG")
-            img_str = base64.b64encode(buffered.getvalue()).decode()
-            return img_str
-
-        except (requests.exceptions.RequestException, OSError) as e:
-            # Log, then rethrow so @retry_ollama can retry generate_messages() on network failures
-            print(f"Image fetch/encode failed: {e}")
-            raise
-        except Exception as e:
-            print(f"Error converting image to base64: {e}")
-            return None
+    def supports_temperature(self) -> Union[bool, None]:
+        return self.model_data.supports_temperature
+
+    def supports_multimodal(self) -> Union[bool, None]:
+        return self.model_data.supports_multimodal
+
+    def supports_structured_outputs(self) -> Union[bool, None]:
+        return self.model_data.supports_structured_outputs
+
+    def supports_json_mode(self) -> Union[bool, None]:
+        return self.model_data.supports_json
 
     ###############################################
     # Model
@@ -207,10 +233,5 @@ class OllamaModel(DeepEvalBaseLLM):
         )
         return cls(**kw)
 
-    def supports_multimodal(self):
-        if self.name in valid_multimodal_models:
-            return True
-        return False
-
     def get_model_name(self):
         return f"{self.name} (Ollama)"
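OllamaModel now answers capability queries from the static OLLAMA_MODELS_DATA catalogue in deepeval/models/llms/constants.py instead of a hard-coded allowlist. A hedged sketch of the lookup pattern, with an assumed record shape (the real entry type lives in constants.py):

    from dataclasses import dataclass
    from typing import Dict, Optional

    @dataclass
    class ModelData:  # assumed shape; see constants.py for the real fields
        supports_multimodal: Optional[bool]
        supports_json: Optional[bool]

    OLLAMA_MODELS_DATA: Dict[str, ModelData] = {
        "llava:13b": ModelData(supports_multimodal=True, supports_json=True),
    }

    model_data = OLLAMA_MODELS_DATA.get("llava:13b")
    if model_data is not None and model_data.supports_multimodal:
        print("multimodal prompts are supported")

One caveat visible in the diff: OLLAMA_MODELS_DATA.get(model) returns None for names missing from the catalogue, so the supports_* methods assume the configured model is listed.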