deepeval 3.7.5__py3-none-any.whl → 3.7.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (133)
  1. deepeval/_version.py +1 -1
  2. deepeval/config/settings.py +35 -1
  3. deepeval/dataset/api.py +23 -1
  4. deepeval/dataset/golden.py +106 -21
  5. deepeval/evaluate/evaluate.py +0 -3
  6. deepeval/evaluate/execute.py +10 -222
  7. deepeval/evaluate/utils.py +6 -30
  8. deepeval/key_handler.py +3 -0
  9. deepeval/metrics/__init__.py +0 -4
  10. deepeval/metrics/answer_relevancy/answer_relevancy.py +89 -132
  11. deepeval/metrics/answer_relevancy/template.py +102 -179
  12. deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
  13. deepeval/metrics/arena_g_eval/template.py +17 -1
  14. deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
  15. deepeval/metrics/argument_correctness/template.py +19 -2
  16. deepeval/metrics/base_metric.py +13 -41
  17. deepeval/metrics/bias/bias.py +102 -108
  18. deepeval/metrics/bias/template.py +14 -2
  19. deepeval/metrics/contextual_precision/contextual_precision.py +56 -92
  20. deepeval/metrics/contextual_recall/contextual_recall.py +58 -85
  21. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +53 -83
  22. deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
  23. deepeval/metrics/conversation_completeness/template.py +23 -3
  24. deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
  25. deepeval/metrics/conversational_dag/nodes.py +66 -123
  26. deepeval/metrics/conversational_dag/templates.py +16 -0
  27. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
  28. deepeval/metrics/dag/dag.py +10 -0
  29. deepeval/metrics/dag/nodes.py +63 -126
  30. deepeval/metrics/dag/templates.py +14 -0
  31. deepeval/metrics/exact_match/exact_match.py +9 -1
  32. deepeval/metrics/faithfulness/faithfulness.py +82 -136
  33. deepeval/metrics/g_eval/g_eval.py +87 -78
  34. deepeval/metrics/g_eval/template.py +18 -1
  35. deepeval/metrics/g_eval/utils.py +7 -6
  36. deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
  37. deepeval/metrics/goal_accuracy/template.py +21 -3
  38. deepeval/metrics/hallucination/hallucination.py +60 -75
  39. deepeval/metrics/hallucination/template.py +13 -0
  40. deepeval/metrics/indicator.py +3 -6
  41. deepeval/metrics/json_correctness/json_correctness.py +40 -38
  42. deepeval/metrics/json_correctness/template.py +10 -0
  43. deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
  44. deepeval/metrics/knowledge_retention/schema.py +9 -3
  45. deepeval/metrics/knowledge_retention/template.py +12 -0
  46. deepeval/metrics/mcp/mcp_task_completion.py +68 -38
  47. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
  48. deepeval/metrics/mcp/template.py +52 -0
  49. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
  50. deepeval/metrics/mcp_use_metric/template.py +12 -0
  51. deepeval/metrics/misuse/misuse.py +77 -97
  52. deepeval/metrics/misuse/template.py +15 -0
  53. deepeval/metrics/multimodal_metrics/__init__.py +0 -1
  54. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +37 -38
  55. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +55 -76
  56. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +37 -38
  57. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +37 -38
  58. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +57 -76
  59. deepeval/metrics/non_advice/non_advice.py +79 -105
  60. deepeval/metrics/non_advice/template.py +12 -0
  61. deepeval/metrics/pattern_match/pattern_match.py +12 -4
  62. deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
  63. deepeval/metrics/pii_leakage/template.py +14 -0
  64. deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
  65. deepeval/metrics/plan_adherence/template.py +11 -0
  66. deepeval/metrics/plan_quality/plan_quality.py +63 -87
  67. deepeval/metrics/plan_quality/template.py +9 -0
  68. deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
  69. deepeval/metrics/prompt_alignment/template.py +12 -0
  70. deepeval/metrics/role_adherence/role_adherence.py +48 -71
  71. deepeval/metrics/role_adherence/template.py +14 -0
  72. deepeval/metrics/role_violation/role_violation.py +75 -108
  73. deepeval/metrics/role_violation/template.py +12 -0
  74. deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
  75. deepeval/metrics/step_efficiency/template.py +11 -0
  76. deepeval/metrics/summarization/summarization.py +115 -183
  77. deepeval/metrics/summarization/template.py +19 -0
  78. deepeval/metrics/task_completion/task_completion.py +67 -73
  79. deepeval/metrics/tool_correctness/tool_correctness.py +43 -42
  80. deepeval/metrics/tool_use/tool_use.py +42 -66
  81. deepeval/metrics/topic_adherence/template.py +13 -0
  82. deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
  83. deepeval/metrics/toxicity/template.py +13 -0
  84. deepeval/metrics/toxicity/toxicity.py +80 -99
  85. deepeval/metrics/turn_contextual_precision/schema.py +3 -3
  86. deepeval/metrics/turn_contextual_precision/template.py +1 -1
  87. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +110 -68
  88. deepeval/metrics/turn_contextual_recall/schema.py +3 -3
  89. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +104 -61
  90. deepeval/metrics/turn_contextual_relevancy/schema.py +2 -2
  91. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +106 -65
  92. deepeval/metrics/turn_faithfulness/schema.py +1 -1
  93. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +104 -73
  94. deepeval/metrics/turn_relevancy/template.py +14 -0
  95. deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
  96. deepeval/metrics/utils.py +145 -90
  97. deepeval/models/base_model.py +44 -6
  98. deepeval/models/embedding_models/azure_embedding_model.py +34 -12
  99. deepeval/models/embedding_models/local_embedding_model.py +22 -7
  100. deepeval/models/embedding_models/ollama_embedding_model.py +17 -6
  101. deepeval/models/embedding_models/openai_embedding_model.py +3 -2
  102. deepeval/models/llms/amazon_bedrock_model.py +226 -71
  103. deepeval/models/llms/anthropic_model.py +141 -47
  104. deepeval/models/llms/azure_model.py +167 -94
  105. deepeval/models/llms/constants.py +2032 -0
  106. deepeval/models/llms/deepseek_model.py +79 -29
  107. deepeval/models/llms/gemini_model.py +126 -67
  108. deepeval/models/llms/grok_model.py +125 -59
  109. deepeval/models/llms/kimi_model.py +126 -81
  110. deepeval/models/llms/litellm_model.py +92 -18
  111. deepeval/models/llms/local_model.py +114 -15
  112. deepeval/models/llms/ollama_model.py +97 -76
  113. deepeval/models/llms/openai_model.py +167 -310
  114. deepeval/models/llms/portkey_model.py +58 -16
  115. deepeval/models/llms/utils.py +5 -2
  116. deepeval/models/utils.py +60 -4
  117. deepeval/simulator/conversation_simulator.py +43 -0
  118. deepeval/simulator/template.py +13 -0
  119. deepeval/test_case/api.py +24 -45
  120. deepeval/test_case/arena_test_case.py +7 -2
  121. deepeval/test_case/conversational_test_case.py +55 -6
  122. deepeval/test_case/llm_test_case.py +60 -6
  123. deepeval/test_run/api.py +3 -0
  124. {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -1
  125. {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/RECORD +128 -132
  126. deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
  127. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
  128. deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
  129. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -133
  130. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
  131. {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
  132. {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
  133. {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0

deepeval/models/llms/litellm_model.py
@@ -9,13 +9,17 @@ from tenacity import (
     RetryCallState,
 )
 
+from deepeval.errors import DeepEvalError
 from deepeval.config.settings import get_settings
 from deepeval.models.utils import (
     require_secret_api_key,
     normalize_kwargs_and_extract_aliases,
 )
+from deepeval.test_case import MLLMImage
+from deepeval.utils import check_if_multimodal, convert_to_multi_modal_array
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.models.llms.utils import trim_and_load_json
+from deepeval.utils import require_param
 
 
 def log_retry_error(retry_state: RetryCallState):
@@ -47,11 +51,11 @@ class LiteLLMModel(DeepEvalBaseLLM):
         model: Optional[str] = None,
         api_key: Optional[str] = None,
         base_url: Optional[str] = None,
-        temperature: float = 0,
+        temperature: Optional[float] = None,
         generation_kwargs: Optional[Dict] = None,
         **kwargs,
     ):
-
+        settings = get_settings()
         normalized_kwargs, alias_values = normalize_kwargs_and_extract_aliases(
             "LiteLLMModel",
             kwargs,
@@ -62,18 +66,13 @@ class LiteLLMModel(DeepEvalBaseLLM):
         if base_url is None and "base_url" in alias_values:
             base_url = alias_values["base_url"]
 
-        settings = get_settings()
         # Get model name from parameter or key file
         model = model or settings.LITELLM_MODEL_NAME
-        if not model:
-            raise ValueError(
-                "Model name must be provided either through parameter or set-litellm command"
-            )
 
         # Get API key from parameter, or settings
         if api_key is not None:
             # keep it secret, keep it safe from serializings, logging and aolike
-            self.api_key: SecretStr | None = SecretStr(api_key)
+            self.api_key: Optional[SecretStr] = SecretStr(api_key)
         else:
             self.api_key = (
                 settings.LITELLM_API_KEY
@@ -84,7 +83,7 @@ class LiteLLMModel(DeepEvalBaseLLM):
             )
 
         # Get API base from parameter, key file, or environment variable
-        self.base_url = (
+        base_url = (
             base_url
             or (
                 str(settings.LITELLM_API_BASE)
@@ -97,13 +96,35 @@ class LiteLLMModel(DeepEvalBaseLLM):
                 else None
             )
         )
+        self.base_url = (
+            str(base_url).rstrip("/") if base_url is not None else None
+        )
+
+        if temperature is not None:
+            temperature = float(temperature)
+        elif settings.TEMPERATURE is not None:
+            temperature = settings.TEMPERATURE
+        else:
+            temperature = 0.0
+
+        # validation
+        model = require_param(
+            model,
+            provider_label="LiteLLMModel",
+            env_var_name="LITELLM_MODEL_NAME",
+            param_hint="model",
+        )
 
         if temperature < 0:
-            raise ValueError("Temperature must be >= 0.")
+            raise DeepEvalError("Temperature must be >= 0.")
         self.temperature = temperature
         # Keep sanitized kwargs for client call to strip legacy keys
         self.kwargs = normalized_kwargs
-        self.generation_kwargs = generation_kwargs or {}
+        self.kwargs.pop("temperature", None)
+
+        self.generation_kwargs = dict(generation_kwargs or {})
+        self.generation_kwargs.pop("temperature", None)
+
         self.evaluation_cost = 0.0  # Initialize cost to 0.0
         super().__init__(model)
 
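
The constructor rework above is the pattern repeated for every provider in this release: temperature resolution is now explicit (argument first, then the TEMPERATURE setting, then 0.0), and the inline ValueError for a missing model name is replaced by the shared require_param helper plus DeepEvalError. A minimal standalone sketch of just the temperature precedence (the function name is illustrative, not deepeval's API):

from typing import Optional

def resolve_temperature(
    param: Optional[float], settings_value: Optional[float]
) -> float:
    # Explicit argument wins, then the TEMPERATURE setting, then 0.0.
    if param is not None:
        return float(param)
    if settings_value is not None:
        return settings_value
    return 0.0

assert resolve_temperature(0.7, 0.2) == 0.7    # explicit argument
assert resolve_temperature(None, 0.2) == 0.2   # settings fallback
assert resolve_temperature(None, None) == 0.0  # default
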
@@ -117,13 +138,19 @@ class LiteLLMModel(DeepEvalBaseLLM):
     )
     def generate(
         self, prompt: str, schema: Optional[BaseModel] = None
-    ) -> Union[str, Dict, Tuple[str, float]]:
+    ) -> Tuple[Union[str, BaseModel], float]:
 
         from litellm import completion
 
+        if check_if_multimodal(prompt):
+            prompt = convert_to_multi_modal_array(input=prompt)
+            content = self.generate_content(prompt)
+        else:
+            content = [{"type": "text", "text": prompt}]
+
         completion_params = {
             "model": self.name,
-            "messages": [{"role": "user", "content": prompt}],
+            "messages": [{"role": "user", "content": content}],
             "temperature": self.temperature,
         }
 
@@ -131,7 +158,7 @@ class LiteLLMModel(DeepEvalBaseLLM):
         api_key = require_secret_api_key(
             self.api_key,
             provider_label="LiteLLM",
-            env_var_name="LITELLM_API_KEY|OPENAI_API_KEY|ANTHROPIC_API_KEY|GOOGLE_API_KEY",
+            env_var_name="LITELLM_API_KEY|LITELLM_PROXY_API_KEY|OPENAI_API_KEY|ANTHROPIC_API_KEY|GOOGLE_API_KEY",
             param_hint="`api_key` to LiteLLMModel(...)",
         )
         completion_params["api_key"] = api_key
@@ -173,13 +200,19 @@ class LiteLLMModel(DeepEvalBaseLLM):
     )
     async def a_generate(
         self, prompt: str, schema: Optional[BaseModel] = None
-    ) -> Union[str, Dict, Tuple[str, float]]:
+    ) -> Tuple[Union[str, BaseModel], float]:
 
         from litellm import acompletion
 
+        if check_if_multimodal(prompt):
+            prompt = convert_to_multi_modal_array(input=prompt)
+            content = self.generate_content(prompt)
+        else:
+            content = [{"type": "text", "text": prompt}]
+
         completion_params = {
             "model": self.name,
-            "messages": [{"role": "user", "content": prompt}],
+            "messages": [{"role": "user", "content": content}],
             "temperature": self.temperature,
         }
 
@@ -241,9 +274,14 @@ class LiteLLMModel(DeepEvalBaseLLM):
             env_var_name="LITELLM_API_KEY|OPENAI_API_KEY|ANTHROPIC_API_KEY|GOOGLE_API_KEY",
             param_hint="`api_key` to LiteLLMModel(...)",
         )
+        if check_if_multimodal(prompt):
+            prompt = convert_to_multi_modal_array(input=prompt)
+            content = self.generate_content(prompt)
+        else:
+            content = [{"type": "text", "text": prompt}]
         completion_params = {
             "model": self.name,
-            "messages": [{"role": "user", "content": prompt}],
+            "messages": [{"role": "user", "content": content}],
             "temperature": self.temperature,
             "api_key": api_key,
             "api_base": self.base_url,
@@ -282,9 +320,14 @@ class LiteLLMModel(DeepEvalBaseLLM):
             env_var_name="LITELLM_API_KEY|OPENAI_API_KEY|ANTHROPIC_API_KEY|GOOGLE_API_KEY",
             param_hint="`api_key` to LiteLLMModel(...)",
         )
+        if check_if_multimodal(prompt):
+            prompt = convert_to_multi_modal_array(input=prompt)
+            content = self.generate_content(prompt)
+        else:
+            content = [{"type": "text", "text": prompt}]
         completion_params = {
             "model": self.name,
-            "messages": [{"role": "user", "content": prompt}],
+            "messages": [{"role": "user", "content": content}],
             "temperature": self.temperature,
             "api_key": api_key,
             "api_base": self.base_url,
@@ -340,6 +383,34 @@ class LiteLLMModel(DeepEvalBaseLLM):
             logging.error(f"Error in LiteLLM generate_samples: {e}")
             raise
 
+    def generate_content(
+        self, multimodal_input: List[Union[str, MLLMImage]] = []
+    ):
+        content = []
+        for element in multimodal_input:
+            if isinstance(element, str):
+                content.append({"type": "text", "text": element})
+            elif isinstance(element, MLLMImage):
+                if element.url and not element.local:
+                    content.append(
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": element.url},
+                        }
+                    )
+                else:
+                    element.ensure_images_loaded()
+                    data_uri = (
+                        f"data:{element.mimeType};base64,{element.dataBase64}"
+                    )
+                    content.append(
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": data_uri},
+                        }
+                    )
+        return content
+
     def calculate_cost(self, response: Any) -> float:
         """Calculate the cost of the response based on token usage."""
         try:
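
From the new generate_content above, a multimodal prompt becomes a list of OpenAI-style content parts: remote images pass through as plain URLs for the provider to fetch, while local images are inlined as base64 data URIs. A self-contained sketch of that mapping using a stand-in image type (illustrative only; the real method operates on deepeval's MLLMImage):

from dataclasses import dataclass
from typing import List, Union

@dataclass
class FakeImage:  # stand-in for deepeval's MLLMImage
    url: str = ""
    local: bool = False
    mimeType: str = "image/png"
    dataBase64: str = "iVBORw0KGgo="  # truncated example payload

def to_content_parts(elements: List[Union[str, FakeImage]]) -> list:
    parts = []
    for el in elements:
        if isinstance(el, str):
            parts.append({"type": "text", "text": el})
        elif el.url and not el.local:
            # remote image: send the URL itself
            parts.append({"type": "image_url", "image_url": {"url": el.url}})
        else:
            # local image: inline the bytes as a data URI
            uri = f"data:{el.mimeType};base64,{el.dataBase64}"
            parts.append({"type": "image_url", "image_url": {"url": uri}})
    return parts

print(to_content_parts(
    ["Describe this image:", FakeImage(url="https://example.com/cat.png")]
))
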
@@ -389,3 +460,6 @@ class LiteLLMModel(DeepEvalBaseLLM):
         None as LiteLLM handles client creation internally
         """
         return None
+
+    def supports_multimodal(self):
+        return True

deepeval/models/llms/local_model.py
@@ -1,8 +1,9 @@
-from typing import Optional, Tuple, Union, Dict
+from typing import Optional, Tuple, Union, Dict, List
 from pydantic import BaseModel, SecretStr
 from openai import OpenAI, AsyncOpenAI
 from openai.types.chat import ChatCompletion
 
+from deepeval.errors import DeepEvalError
 from deepeval.config.settings import get_settings
 from deepeval.models.retry_policy import (
     create_retry_decorator,
@@ -14,6 +15,12 @@ from deepeval.models.utils import (
 )
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.constants import ProviderSlug as PS
+from deepeval.test_case import MLLMImage
+from deepeval.utils import (
+    check_if_multimodal,
+    convert_to_multi_modal_array,
+    require_param,
+)
 
 
 # consistent retry rules
@@ -26,7 +33,7 @@ class LocalModel(DeepEvalBaseLLM):
         model: Optional[str] = None,
         api_key: Optional[str] = None,
         base_url: Optional[str] = None,
-        temperature: float = 0,
+        temperature: Optional[float] = None,
         format: Optional[str] = None,
         generation_kwargs: Optional[Dict] = None,
         **kwargs,
@@ -35,38 +42,64 @@ class LocalModel(DeepEvalBaseLLM):
 
         model = model or settings.LOCAL_MODEL_NAME
         if api_key is not None:
-            # keep it secret, keep it safe from serializings, logging and alike
-            self.local_model_api_key: SecretStr | None = SecretStr(api_key)
+            self.local_model_api_key: Optional[SecretStr] = SecretStr(api_key)
         else:
             self.local_model_api_key = settings.LOCAL_MODEL_API_KEY
 
+        base_url = (
+            base_url if base_url is not None else settings.LOCAL_MODEL_BASE_URL
+        )
         self.base_url = (
-            base_url
-            or settings.LOCAL_MODEL_BASE_URL
-            and str(settings.LOCAL_MODEL_BASE_URL)
+            str(base_url).rstrip("/") if base_url is not None else None
         )
         self.format = format or settings.LOCAL_MODEL_FORMAT
+
+        if temperature is not None:
+            temperature = float(temperature)
+        elif settings.TEMPERATURE is not None:
+            temperature = settings.TEMPERATURE
+        else:
+            temperature = 0.0
+
+        # validation
+        model = require_param(
+            model,
+            provider_label="LocalModel",
+            env_var_name="LOCAL_MODEL_NAME",
+            param_hint="model",
+        )
+
         if temperature < 0:
-            raise ValueError("Temperature must be >= 0.")
+            raise DeepEvalError("Temperature must be >= 0.")
         self.temperature = temperature
-        # Keep sanitized kwargs for client call to strip legacy keys
+
         self.kwargs = kwargs
-        self.generation_kwargs = generation_kwargs or {}
+        self.kwargs.pop("temperature", None)
+
+        self.generation_kwargs = dict(generation_kwargs or {})
+        self.generation_kwargs.pop("temperature", None)
+
         super().__init__(model)
 
     ###############################################
-    # Other generate functions
+    # Generate functions
     ###############################################
 
     @retry_local
     def generate(
         self, prompt: str, schema: Optional[BaseModel] = None
-    ) -> Tuple[Union[str, Dict], float]:
+    ) -> Tuple[Union[str, BaseModel], float]:
+
+        if check_if_multimodal(prompt):
+            prompt = convert_to_multi_modal_array(input=prompt)
+            content = self.generate_content(prompt)
+        else:
+            content = prompt
 
         client = self.load_model(async_mode=False)
         response: ChatCompletion = client.chat.completions.create(
             model=self.name,
-            messages=[{"role": "user", "content": prompt}],
+            messages=[{"role": "user", "content": content}],
             temperature=self.temperature,
             **self.generation_kwargs,
         )
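
Two behavioral details are worth calling out in this hunk. The truthiness-based `or`/`and` chain for the base URL is replaced by explicit None checks, and the resolved value is stringified with trailing slashes stripped, so "http://host:8000/" and "http://host:8000" now configure the client identically. Separately, `temperature` is popped from both `kwargs` and `generation_kwargs`, which prevents it from reaching the client a second time alongside `self.temperature`. A small sketch of the normalization (function name is illustrative):

from typing import Optional

def normalize_base_url(
    base_url: Optional[str], settings_url: Optional[str]
) -> Optional[str]:
    # explicit argument first, then the settings value
    resolved = base_url if base_url is not None else settings_url
    # strip trailing slashes so both spellings behave the same
    return str(resolved).rstrip("/") if resolved is not None else None

assert normalize_base_url("http://localhost:8000/", None) == "http://localhost:8000"
assert normalize_base_url(None, "http://gpu-box:8080/") == "http://gpu-box:8080"
assert normalize_base_url(None, None) is None
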
@@ -81,12 +114,18 @@ class LocalModel(DeepEvalBaseLLM):
     @retry_local
     async def a_generate(
         self, prompt: str, schema: Optional[BaseModel] = None
-    ) -> Tuple[Union[str, Dict], float]:
+    ) -> Tuple[Union[str, BaseModel], float]:
+
+        if check_if_multimodal(prompt):
+            prompt = convert_to_multi_modal_array(input=prompt)
+            content = self.generate_content(prompt)
+        else:
+            content = prompt
 
         client = self.load_model(async_mode=True)
         response: ChatCompletion = await client.chat.completions.create(
             model=self.name,
-            messages=[{"role": "user", "content": prompt}],
+            messages=[{"role": "user", "content": content}],
             temperature=self.temperature,
             **self.generation_kwargs,
         )
@@ -98,6 +137,63 @@ class LocalModel(DeepEvalBaseLLM):
         else:
             return res_content, 0.0
 
+    def generate_content(
+        self, multimodal_input: List[Union[str, MLLMImage]] = []
+    ):
+        """
+        Converts multimodal input into OpenAI-compatible format.
+        Uses data URIs for all images since we can't guarantee local servers support URL fetching.
+        """
+        prompt = []
+        for element in multimodal_input:
+            if isinstance(element, str):
+                prompt.append({"type": "text", "text": element})
+            elif isinstance(element, MLLMImage):
+                # For local servers, use data URIs for both remote and local images
+                # Most local servers don't support fetching external URLs
+                if element.url and not element.local:
+                    import requests
+                    import base64
+
+                    settings = get_settings()
+                    try:
+                        response = requests.get(
+                            element.url,
+                            timeout=(
+                                settings.MEDIA_IMAGE_CONNECT_TIMEOUT_SECONDS,
+                                settings.MEDIA_IMAGE_READ_TIMEOUT_SECONDS,
+                            ),
+                        )
+                        response.raise_for_status()
+
+                        # Get mime type from response
+                        mime_type = response.headers.get(
+                            "content-type", element.mimeType or "image/jpeg"
+                        )
+
+                        # Encode to base64
+                        b64_data = base64.b64encode(response.content).decode(
+                            "utf-8"
+                        )
+                        data_uri = f"data:{mime_type};base64,{b64_data}"
+
+                    except Exception as e:
+                        raise ValueError(
+                            f"Failed to fetch remote image {element.url}: {e}"
+                        )
+                else:
+                    element.ensure_images_loaded()
+                    mime_type = element.mimeType or "image/jpeg"
+                    data_uri = f"data:{mime_type};base64,{element.dataBase64}"
+
+                prompt.append(
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": data_uri},
+                    }
+                )
+        return prompt
+
     ###############################################
     # Model
     ###############################################
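
The docstring above records the design choice: a local OpenAI-compatible server cannot be assumed to fetch external URLs, so every image, remote or local, is inlined as a data URI. A rough standalone sketch of the remote branch (the hard-coded timeout is a placeholder for the MEDIA_IMAGE_CONNECT/READ_TIMEOUT_SECONDS settings used in the diff):

import base64
import requests

def fetch_as_data_uri(url: str, fallback_mime: str = "image/jpeg") -> str:
    # Download the image, prefer the server-reported content type,
    # and inline the bytes so the model server needs no network access.
    response = requests.get(url, timeout=(5, 30))  # illustrative timeouts
    response.raise_for_status()
    mime = response.headers.get("content-type", fallback_mime)
    b64 = base64.b64encode(response.content).decode("utf-8")
    return f"data:{mime};base64,{b64}"
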
@@ -105,6 +201,9 @@ class LocalModel(DeepEvalBaseLLM):
     def get_model_name(self):
         return f"{self.name} (Local Model)"
 
+    def supports_multimodal(self):
+        return True
+
     def load_model(self, async_mode: bool = False):
         if not async_mode:
             return self._build_client(OpenAI)

deepeval/models/llms/ollama_model.py
@@ -1,11 +1,10 @@
 from typing import TYPE_CHECKING, Optional, Tuple, Union, Dict, List
 from pydantic import BaseModel
-import requests
 import base64
-import io
 
+from deepeval.errors import DeepEvalError
 from deepeval.config.settings import get_settings
-from deepeval.utils import require_dependency
+from deepeval.utils import require_dependency, require_param
 from deepeval.models.retry_policy import (
     create_retry_decorator,
 )
@@ -13,17 +12,7 @@ from deepeval.utils import convert_to_multi_modal_array, check_if_multimodal
 from deepeval.test_case import MLLMImage
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.constants import ProviderSlug as PS
-
-valid_multimodal_models = [
-    "llava:7b",
-    "llava:13b",
-    "llava:34b",
-    "llama4",
-    "gemma3",
-    "qwen3-vl",
-    "qwen2.5-vl",
-    # TODO: Add more models later on by looking at their catelogue
-]
+from deepeval.models.llms.constants import OLLAMA_MODELS_DATA
 
 if TYPE_CHECKING:
     from ollama import ChatResponse
@@ -36,26 +25,46 @@ class OllamaModel(DeepEvalBaseLLM):
         self,
         model: Optional[str] = None,
         base_url: Optional[str] = None,
-        temperature: float = 0,
+        temperature: Optional[float] = None,
         generation_kwargs: Optional[Dict] = None,
         **kwargs,
     ):
         settings = get_settings()
-        model = model or settings.LOCAL_MODEL_NAME
-        self.base_url = (
-            base_url
-            or (
-                settings.LOCAL_MODEL_BASE_URL
-                and str(settings.LOCAL_MODEL_BASE_URL)
-            )
-            or "http://localhost:11434"
+        model = model or settings.OLLAMA_MODEL_NAME
+        self.model_data = OLLAMA_MODELS_DATA.get(model)
+
+        if base_url is not None:
+            self.base_url = str(base_url).rstrip("/")
+        elif settings.LOCAL_MODEL_BASE_URL is not None:
+            self.base_url = str(settings.LOCAL_MODEL_BASE_URL).rstrip("/")
+        else:
+            self.base_url = "http://localhost:11434"
+
+        if temperature is not None:
+            temperature = float(temperature)
+        elif settings.TEMPERATURE is not None:
+            temperature = settings.TEMPERATURE
+        else:
+            temperature = 0.0
+
+        # validation
+        model = require_param(
+            model,
+            provider_label="OllamaModel",
+            env_var_name="LOCAL_MODEL_NAME",
+            param_hint="model",
         )
+
         if temperature < 0:
-            raise ValueError("Temperature must be >= 0.")
+            raise DeepEvalError("Temperature must be >= 0.")
         self.temperature = temperature
         # Keep sanitized kwargs for client call to strip legacy keys
         self.kwargs = kwargs
-        self.generation_kwargs = generation_kwargs or {}
+        self.kwargs.pop("temperature", None)
+
+        self.generation_kwargs = dict(generation_kwargs or {})
+        self.generation_kwargs.pop("temperature", None)
+
         super().__init__(model)
 
     ###############################################
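
The Ollama constructor now looks up per-model capability data (`OLLAMA_MODELS_DATA.get(model)`) and resolves its endpoint through an explicit precedence chain, falling back to Ollama's default port. The same logic in isolation (function name is illustrative):

from typing import Optional

def resolve_ollama_base_url(
    base_url: Optional[str], settings_url: Optional[str]
) -> str:
    if base_url is not None:        # explicit argument
        return str(base_url).rstrip("/")
    if settings_url is not None:    # LOCAL_MODEL_BASE_URL setting
        return str(settings_url).rstrip("/")
    return "http://localhost:11434"  # Ollama's default endpoint

assert resolve_ollama_base_url(None, None) == "http://localhost:11434"
assert resolve_ollama_base_url("http://10.0.0.5:11434/", None) == "http://10.0.0.5:11434"
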
@@ -65,7 +74,7 @@ class OllamaModel(DeepEvalBaseLLM):
     @retry_ollama
     def generate(
         self, prompt: str, schema: Optional[BaseModel] = None
-    ) -> Tuple[Union[str, Dict], float]:
+    ) -> Tuple[Union[str, BaseModel], float]:
         chat_model = self.load_model()
 
         if check_if_multimodal(prompt):
@@ -73,7 +82,6 @@ class OllamaModel(DeepEvalBaseLLM):
             messages = self.generate_messages(prompt)
         else:
             messages = [{"role": "user", "content": prompt}]
-        print(messages)
 
         response: ChatResponse = chat_model.chat(
             model=self.name,
@@ -96,7 +104,7 @@ class OllamaModel(DeepEvalBaseLLM):
     @retry_ollama
     async def a_generate(
         self, prompt: str, schema: Optional[BaseModel] = None
-    ) -> Tuple[str, float]:
+    ) -> Tuple[Union[str, BaseModel], float]:
         chat_model = self.load_model(async_mode=True)
 
         if check_if_multimodal(prompt):
@@ -127,60 +135,78 @@ class OllamaModel(DeepEvalBaseLLM):
         self, multimodal_input: List[Union[str, MLLMImage]] = []
     ):
         messages = []
-        for ele in multimodal_input:
-            if isinstance(ele, str):
+
+        for element in multimodal_input:
+            if isinstance(element, str):
+                messages.append(
+                    {
+                        "role": "user",
+                        "content": element,
+                    }
+                )
+            elif isinstance(element, MLLMImage):
+                if element.url and not element.local:
+                    import requests
+                    from PIL import Image
+                    import io
+
+                    settings = get_settings()
+                    try:
+                        response = requests.get(
+                            element.url,
+                            stream=True,
+                            timeout=(
+                                settings.MEDIA_IMAGE_CONNECT_TIMEOUT_SECONDS,
+                                settings.MEDIA_IMAGE_READ_TIMEOUT_SECONDS,
+                            ),
+                        )
+                        response.raise_for_status()
+
+                        # Convert to JPEG and encode
+                        image = Image.open(io.BytesIO(response.content))
+                        buffered = io.BytesIO()
+
+                        # Convert RGBA/LA/P to RGB for JPEG
+                        if image.mode in ("RGBA", "LA", "P"):
+                            image = image.convert("RGB")
+
+                        image.save(buffered, format="JPEG")
+                        img_b64 = base64.b64encode(buffered.getvalue()).decode()
+
+                    except (requests.exceptions.RequestException, OSError) as e:
+                        print(f"Image fetch/encode failed: {e}")
+                        raise
+                else:
+                    element.ensure_images_loaded()
+                    img_b64 = element.dataBase64
+
                 messages.append(
                     {
                         "role": "user",
-                        "content": ele,
+                        "images": [img_b64],
                     }
                 )
-            elif isinstance(ele, MLLMImage):
-                img_b64 = self.convert_to_base64(ele.url, ele.local)
-                if img_b64 is not None:
-                    messages.append(
-                        {
-                            "role": "user",
-                            "images": [img_b64],
-                        }
-                    )
+
         return messages
 
     ###############################################
-    # Utilities
+    # Capabilities
     ###############################################
 
-    def convert_to_base64(self, image_source: str, is_local: bool) -> str:
-        from PIL import Image
+    def supports_log_probs(self) -> Union[bool, None]:
+        return self.model_data.supports_log_probs
 
-        settings = get_settings()
-        try:
-            if not is_local:
-                response = requests.get(
-                    image_source,
-                    stream=True,
-                    timeout=(
-                        settings.MEDIA_IMAGE_CONNECT_TIMEOUT_SECONDS,
-                        settings.MEDIA_IMAGE_READ_TIMEOUT_SECONDS,
-                    ),
-                )
-                response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
-                image = Image.open(io.BytesIO(response.content))
-            else:
-                image = Image.open(image_source)
-
-            buffered = io.BytesIO()
-            image.save(buffered, format="JPEG")
-            img_str = base64.b64encode(buffered.getvalue()).decode()
-            return img_str
-
-        except (requests.exceptions.RequestException, OSError) as e:
-            # Log, then rethrow so @retry_ollama can retry generate_messages() on network failures
-            print(f"Image fetch/encode failed: {e}")
-            raise
-        except Exception as e:
-            print(f"Error converting image to base64: {e}")
-            return None
+    def supports_temperature(self) -> Union[bool, None]:
+        return self.model_data.supports_temperature
+
+    def supports_multimodal(self) -> Union[bool, None]:
+        return self.model_data.supports_multimodal
+
+    def supports_structured_outputs(self) -> Union[bool, None]:
+        return self.model_data.supports_structured_outputs
+
+    def supports_json_mode(self) -> Union[bool, None]:
+        return self.model_data.supports_json
 
     ###############################################
     # Model
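
Note the payload shape here differs from the OpenAI-style content parts built in the other two files: Ollama's chat API takes raw base64 strings (no data: prefix) in a per-message "images" list, so the rewritten generate_messages emits separate user messages for text and images. Roughly, for a text-plus-image input it produces something like (values abbreviated):

# Approximate output of OllamaModel.generate_messages for
# ["What is in this picture?", <MLLMImage>]:
messages = [
    {"role": "user", "content": "What is in this picture?"},
    {"role": "user", "images": ["/9j/4AAQSkZJRg..."]},  # raw base64 JPEG
]
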
@@ -207,10 +233,5 @@ class OllamaModel(DeepEvalBaseLLM):
         )
         return cls(**kw)
 
-    def supports_multimodal(self):
-        if self.name in valid_multimodal_models:
-            return True
-        return False
-
     def get_model_name(self):
         return f"{self.name} (Ollama)"
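
With the hardcoded valid_multimodal_models allowlist gone, capability checks now key off the OLLAMA_MODELS_DATA catalog shipped in the new deepeval/models/llms/constants.py (the +2032-line file in the list above). A sketch of the pattern with a made-up two-entry catalog (the field names follow the accessors in the diff, but the real data structure may differ):

from dataclasses import dataclass
from typing import Dict, Optional

@dataclass(frozen=True)
class ModelData:  # illustrative shape, not deepeval's actual class
    supports_multimodal: bool
    supports_json: bool
    supports_structured_outputs: bool

CATALOG: Dict[str, ModelData] = {
    "llava:7b": ModelData(True, True, False),
    "llama3.1": ModelData(False, True, True),
}

def supports_multimodal(model_name: str) -> Optional[bool]:
    data = CATALOG.get(model_name)
    # unknown model -> capability unknown
    return data.supports_multimodal if data is not None else None

One consequence visible in the diff: OLLAMA_MODELS_DATA.get(model) can return None for a model not in the catalog, and the new supports_* methods dereference self.model_data without a guard, so they appear to assume the catalog covers every configured model.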