deepeval 3.7.4__py3-none-any.whl → 3.7.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (224)
  1. deepeval/_version.py +1 -1
  2. deepeval/config/settings.py +35 -1
  3. deepeval/dataset/api.py +23 -1
  4. deepeval/dataset/golden.py +139 -2
  5. deepeval/evaluate/evaluate.py +16 -11
  6. deepeval/evaluate/execute.py +13 -181
  7. deepeval/evaluate/utils.py +6 -26
  8. deepeval/integrations/pydantic_ai/agent.py +19 -2
  9. deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
  10. deepeval/key_handler.py +3 -0
  11. deepeval/metrics/__init__.py +14 -16
  12. deepeval/metrics/answer_relevancy/answer_relevancy.py +118 -116
  13. deepeval/metrics/answer_relevancy/template.py +22 -3
  14. deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
  15. deepeval/metrics/arena_g_eval/template.py +17 -1
  16. deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
  17. deepeval/metrics/argument_correctness/template.py +19 -2
  18. deepeval/metrics/base_metric.py +13 -44
  19. deepeval/metrics/bias/bias.py +102 -108
  20. deepeval/metrics/bias/template.py +14 -2
  21. deepeval/metrics/contextual_precision/contextual_precision.py +96 -94
  22. deepeval/metrics/contextual_precision/template.py +115 -66
  23. deepeval/metrics/contextual_recall/contextual_recall.py +94 -84
  24. deepeval/metrics/contextual_recall/template.py +106 -55
  25. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +86 -84
  26. deepeval/metrics/contextual_relevancy/template.py +87 -58
  27. deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
  28. deepeval/metrics/conversation_completeness/template.py +23 -3
  29. deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
  30. deepeval/metrics/conversational_dag/nodes.py +66 -123
  31. deepeval/metrics/conversational_dag/templates.py +16 -0
  32. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
  33. deepeval/metrics/dag/dag.py +10 -0
  34. deepeval/metrics/dag/nodes.py +63 -126
  35. deepeval/metrics/dag/templates.py +16 -2
  36. deepeval/metrics/exact_match/exact_match.py +9 -1
  37. deepeval/metrics/faithfulness/faithfulness.py +138 -149
  38. deepeval/metrics/faithfulness/schema.py +1 -1
  39. deepeval/metrics/faithfulness/template.py +200 -115
  40. deepeval/metrics/g_eval/g_eval.py +87 -78
  41. deepeval/metrics/g_eval/template.py +18 -1
  42. deepeval/metrics/g_eval/utils.py +7 -6
  43. deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
  44. deepeval/metrics/goal_accuracy/template.py +21 -3
  45. deepeval/metrics/hallucination/hallucination.py +60 -75
  46. deepeval/metrics/hallucination/template.py +13 -0
  47. deepeval/metrics/indicator.py +7 -10
  48. deepeval/metrics/json_correctness/json_correctness.py +40 -38
  49. deepeval/metrics/json_correctness/template.py +10 -0
  50. deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
  51. deepeval/metrics/knowledge_retention/schema.py +9 -3
  52. deepeval/metrics/knowledge_retention/template.py +12 -0
  53. deepeval/metrics/mcp/mcp_task_completion.py +68 -38
  54. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
  55. deepeval/metrics/mcp/template.py +52 -0
  56. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
  57. deepeval/metrics/mcp_use_metric/template.py +12 -0
  58. deepeval/metrics/misuse/misuse.py +77 -97
  59. deepeval/metrics/misuse/template.py +15 -0
  60. deepeval/metrics/multimodal_metrics/__init__.py +0 -19
  61. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +59 -53
  62. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +79 -95
  63. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +59 -53
  64. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +59 -53
  65. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +111 -109
  66. deepeval/metrics/non_advice/non_advice.py +79 -105
  67. deepeval/metrics/non_advice/template.py +12 -0
  68. deepeval/metrics/pattern_match/pattern_match.py +12 -4
  69. deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
  70. deepeval/metrics/pii_leakage/template.py +14 -0
  71. deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
  72. deepeval/metrics/plan_adherence/template.py +11 -0
  73. deepeval/metrics/plan_quality/plan_quality.py +63 -87
  74. deepeval/metrics/plan_quality/template.py +9 -0
  75. deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
  76. deepeval/metrics/prompt_alignment/template.py +12 -0
  77. deepeval/metrics/ragas.py +3 -3
  78. deepeval/metrics/role_adherence/role_adherence.py +48 -71
  79. deepeval/metrics/role_adherence/template.py +14 -0
  80. deepeval/metrics/role_violation/role_violation.py +75 -108
  81. deepeval/metrics/role_violation/template.py +12 -0
  82. deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
  83. deepeval/metrics/step_efficiency/template.py +11 -0
  84. deepeval/metrics/summarization/summarization.py +115 -183
  85. deepeval/metrics/summarization/template.py +19 -0
  86. deepeval/metrics/task_completion/task_completion.py +67 -73
  87. deepeval/metrics/tool_correctness/tool_correctness.py +45 -44
  88. deepeval/metrics/tool_use/tool_use.py +42 -66
  89. deepeval/metrics/topic_adherence/template.py +13 -0
  90. deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
  91. deepeval/metrics/toxicity/template.py +13 -0
  92. deepeval/metrics/toxicity/toxicity.py +80 -99
  93. deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  94. deepeval/metrics/turn_contextual_precision/template.py +187 -0
  95. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +592 -0
  96. deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  97. deepeval/metrics/turn_contextual_recall/template.py +178 -0
  98. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +563 -0
  99. deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
  100. deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
  101. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +576 -0
  102. deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
  103. deepeval/metrics/turn_faithfulness/template.py +218 -0
  104. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +627 -0
  105. deepeval/metrics/turn_relevancy/template.py +14 -0
  106. deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
  107. deepeval/metrics/utils.py +158 -122
  108. deepeval/models/__init__.py +0 -12
  109. deepeval/models/base_model.py +49 -33
  110. deepeval/models/embedding_models/__init__.py +7 -0
  111. deepeval/models/embedding_models/azure_embedding_model.py +79 -33
  112. deepeval/models/embedding_models/local_embedding_model.py +39 -20
  113. deepeval/models/embedding_models/ollama_embedding_model.py +52 -19
  114. deepeval/models/embedding_models/openai_embedding_model.py +42 -22
  115. deepeval/models/llms/amazon_bedrock_model.py +226 -72
  116. deepeval/models/llms/anthropic_model.py +178 -63
  117. deepeval/models/llms/azure_model.py +218 -60
  118. deepeval/models/llms/constants.py +2032 -0
  119. deepeval/models/llms/deepseek_model.py +95 -40
  120. deepeval/models/llms/gemini_model.py +209 -64
  121. deepeval/models/llms/grok_model.py +139 -68
  122. deepeval/models/llms/kimi_model.py +140 -90
  123. deepeval/models/llms/litellm_model.py +131 -37
  124. deepeval/models/llms/local_model.py +125 -21
  125. deepeval/models/llms/ollama_model.py +147 -24
  126. deepeval/models/llms/openai_model.py +222 -269
  127. deepeval/models/llms/portkey_model.py +81 -22
  128. deepeval/models/llms/utils.py +8 -3
  129. deepeval/models/retry_policy.py +17 -14
  130. deepeval/models/utils.py +106 -5
  131. deepeval/optimizer/__init__.py +5 -0
  132. deepeval/optimizer/algorithms/__init__.py +6 -0
  133. deepeval/optimizer/algorithms/base.py +29 -0
  134. deepeval/optimizer/algorithms/configs.py +18 -0
  135. deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  136. deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
  137. deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  138. deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
  139. deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  140. deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  141. deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  142. deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  143. deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  144. deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
  145. deepeval/{optimization → optimizer}/configs.py +5 -8
  146. deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
  147. deepeval/optimizer/prompt_optimizer.py +263 -0
  148. deepeval/optimizer/rewriter/__init__.py +5 -0
  149. deepeval/optimizer/rewriter/rewriter.py +124 -0
  150. deepeval/optimizer/rewriter/utils.py +214 -0
  151. deepeval/optimizer/scorer/__init__.py +5 -0
  152. deepeval/optimizer/scorer/base.py +86 -0
  153. deepeval/optimizer/scorer/scorer.py +316 -0
  154. deepeval/optimizer/scorer/utils.py +30 -0
  155. deepeval/optimizer/types.py +148 -0
  156. deepeval/{optimization → optimizer}/utils.py +47 -165
  157. deepeval/prompt/prompt.py +5 -9
  158. deepeval/simulator/conversation_simulator.py +43 -0
  159. deepeval/simulator/template.py +13 -0
  160. deepeval/test_case/__init__.py +1 -3
  161. deepeval/test_case/api.py +26 -45
  162. deepeval/test_case/arena_test_case.py +7 -2
  163. deepeval/test_case/conversational_test_case.py +68 -1
  164. deepeval/test_case/llm_test_case.py +206 -1
  165. deepeval/test_case/utils.py +4 -8
  166. deepeval/test_run/api.py +18 -14
  167. deepeval/test_run/test_run.py +3 -3
  168. deepeval/tracing/patchers.py +9 -4
  169. deepeval/tracing/tracing.py +2 -2
  170. deepeval/utils.py +65 -0
  171. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -4
  172. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/RECORD +180 -193
  173. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
  174. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
  175. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
  176. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
  177. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
  178. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
  179. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
  180. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
  181. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
  182. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
  183. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
  184. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  185. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
  186. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
  187. deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
  188. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
  189. deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
  190. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -148
  191. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
  192. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  193. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
  194. deepeval/models/mlllms/__init__.py +0 -4
  195. deepeval/models/mlllms/azure_model.py +0 -343
  196. deepeval/models/mlllms/gemini_model.py +0 -313
  197. deepeval/models/mlllms/ollama_model.py +0 -175
  198. deepeval/models/mlllms/openai_model.py +0 -309
  199. deepeval/optimization/__init__.py +0 -13
  200. deepeval/optimization/adapters/__init__.py +0 -2
  201. deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
  202. deepeval/optimization/aggregates.py +0 -14
  203. deepeval/optimization/copro/configs.py +0 -31
  204. deepeval/optimization/gepa/__init__.py +0 -7
  205. deepeval/optimization/gepa/configs.py +0 -115
  206. deepeval/optimization/miprov2/configs.py +0 -134
  207. deepeval/optimization/miprov2/loop.py +0 -785
  208. deepeval/optimization/mutations/__init__.py +0 -0
  209. deepeval/optimization/mutations/prompt_rewriter.py +0 -458
  210. deepeval/optimization/policies/__init__.py +0 -16
  211. deepeval/optimization/policies/tie_breaker.py +0 -67
  212. deepeval/optimization/prompt_optimizer.py +0 -462
  213. deepeval/optimization/simba/__init__.py +0 -0
  214. deepeval/optimization/simba/configs.py +0 -33
  215. deepeval/optimization/types.py +0 -361
  216. deepeval/test_case/mllm_test_case.py +0 -170
  217. /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
  218. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
  219. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
  220. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
  221. /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
  222. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
  223. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
  224. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
deepeval/models/llms/litellm_model.py

@@ -9,10 +9,17 @@ from tenacity import (
     RetryCallState,
 )
 
+from deepeval.errors import DeepEvalError
 from deepeval.config.settings import get_settings
-from deepeval.models.utils import require_secret_api_key
+from deepeval.models.utils import (
+    require_secret_api_key,
+    normalize_kwargs_and_extract_aliases,
+)
+from deepeval.test_case import MLLMImage
+from deepeval.utils import check_if_multimodal, convert_to_multi_modal_array
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.models.llms.utils import trim_and_load_json
+from deepeval.utils import require_param
 
 
 def log_retry_error(retry_state: RetryCallState):
@@ -27,6 +34,10 @@ retryable_exceptions = (
     Exception,  # LiteLLM handles specific exceptions internally
 )
 
+_ALIAS_MAP = {
+    "base_url": ["api_base"],
+}
+
 
 class LiteLLMModel(DeepEvalBaseLLM):
     EXP_BASE: int = 2
@@ -39,24 +50,29 @@ class LiteLLMModel(DeepEvalBaseLLM):
         self,
         model: Optional[str] = None,
         api_key: Optional[str] = None,
-        api_base: Optional[str] = None,
-        temperature: float = 0,
+        base_url: Optional[str] = None,
+        temperature: Optional[float] = None,
         generation_kwargs: Optional[Dict] = None,
         **kwargs,
     ):
-
         settings = get_settings()
+        normalized_kwargs, alias_values = normalize_kwargs_and_extract_aliases(
+            "LiteLLMModel",
+            kwargs,
+            _ALIAS_MAP,
+        )
+
+        # re-map depricated keywords to re-named positional args
+        if base_url is None and "base_url" in alias_values:
+            base_url = alias_values["base_url"]
+
         # Get model name from parameter or key file
-        model_name = model or settings.LITELLM_MODEL_NAME
-        if not model_name:
-            raise ValueError(
-                "Model name must be provided either through parameter or set-litellm command"
-            )
+        model = model or settings.LITELLM_MODEL_NAME
 
         # Get API key from parameter, or settings
         if api_key is not None:
             # keep it secret, keep it safe from serializings, logging and aolike
-            self.api_key: SecretStr | None = SecretStr(api_key)
+            self.api_key: Optional[SecretStr] = SecretStr(api_key)
         else:
             self.api_key = (
                 settings.LITELLM_API_KEY
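The hunk above removes api_base from the signature and recovers it from **kwargs through _ALIAS_MAP. A minimal sketch of what the call site implies normalize_kwargs_and_extract_aliases does; the real helper lives in deepeval.models.utils and may differ in detail:

    from typing import Dict, List, Tuple


    def normalize_kwargs_and_extract_aliases(
        provider_label: str,
        kwargs: Dict,
        alias_map: Dict[str, List[str]],
    ) -> Tuple[Dict, Dict]:
        """Strip deprecated kwarg spellings, surfacing them under their new names."""
        normalized = dict(kwargs)
        alias_values: Dict = {}
        for new_name, old_names in alias_map.items():
            for old_name in old_names:
                if old_name in normalized:
                    # e.g. api_base=... becomes alias_values["base_url"]
                    alias_values.setdefault(new_name, normalized.pop(old_name))
        return normalized, alias_values

Under that reading, LiteLLMModel(model="gpt-4o-mini", api_base="http://localhost:4000") keeps working: the deprecated keyword is pulled out of kwargs into alias_values and re-assigned to base_url before validation.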
@@ -67,8 +83,8 @@ class LiteLLMModel(DeepEvalBaseLLM):
         )
 
         # Get API base from parameter, key file, or environment variable
-        self.api_base = (
-            api_base
+        base_url = (
+            base_url
             or (
                 str(settings.LITELLM_API_BASE)
                 if settings.LITELLM_API_BASE is not None
@@ -80,14 +96,37 @@ class LiteLLMModel(DeepEvalBaseLLM):
                 else None
             )
         )
+        self.base_url = (
+            str(base_url).rstrip("/") if base_url is not None else None
+        )
+
+        if temperature is not None:
+            temperature = float(temperature)
+        elif settings.TEMPERATURE is not None:
+            temperature = settings.TEMPERATURE
+        else:
+            temperature = 0.0
+
+        # validation
+        model = require_param(
+            model,
+            provider_label="LiteLLMModel",
+            env_var_name="LITELLM_MODEL_NAME",
+            param_hint="model",
+        )
 
         if temperature < 0:
-            raise ValueError("Temperature must be >= 0.")
+            raise DeepEvalError("Temperature must be >= 0.")
         self.temperature = temperature
-        self.kwargs = kwargs
-        self.generation_kwargs = generation_kwargs or {}
+        # Keep sanitized kwargs for client call to strip legacy keys
+        self.kwargs = normalized_kwargs
+        self.kwargs.pop("temperature", None)
+
+        self.generation_kwargs = dict(generation_kwargs or {})
+        self.generation_kwargs.pop("temperature", None)
+
         self.evaluation_cost = 0.0  # Initialize cost to 0.0
-        super().__init__(model_name)
+        super().__init__(model)
 
     @retry(
         wait=wait_exponential_jitter(
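The temperature fallback and the require_param call above follow the same resolution order now used across providers: explicit argument first, then the settings value, then a default, raising only when every source is empty. The temperature branch restated as a standalone sketch (resolve_temperature is a name invented here for illustration):

    from typing import Optional


    def resolve_temperature(
        arg: Optional[float], settings_value: Optional[float]
    ) -> float:
        # explicit argument wins, then the TEMPERATURE setting, then 0.0
        if arg is not None:
            return float(arg)
        if settings_value is not None:
            return settings_value
        return 0.0


    assert resolve_temperature(None, None) == 0.0
    assert resolve_temperature(0.7, 0.2) == 0.7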
@@ -99,12 +138,19 @@ class LiteLLMModel(DeepEvalBaseLLM):
     )
     def generate(
         self, prompt: str, schema: Optional[BaseModel] = None
-    ) -> Union[str, Dict, Tuple[str, float]]:
+    ) -> Tuple[Union[str, BaseModel], float]:
+
         from litellm import completion
 
+        if check_if_multimodal(prompt):
+            prompt = convert_to_multi_modal_array(input=prompt)
+            content = self.generate_content(prompt)
+        else:
+            content = [{"type": "text", "text": prompt}]
+
         completion_params = {
-            "model": self.model_name,
-            "messages": [{"role": "user", "content": prompt}],
+            "model": self.name,
+            "messages": [{"role": "user", "content": content}],
             "temperature": self.temperature,
         }
 
@@ -112,12 +158,12 @@ class LiteLLMModel(DeepEvalBaseLLM):
         api_key = require_secret_api_key(
             self.api_key,
             provider_label="LiteLLM",
-            env_var_name="LITELLM_API_KEY|OPENAI_API_KEY|ANTHROPIC_API_KEY|GOOGLE_API_KEY",
+            env_var_name="LITELLM_API_KEY|LITELLM_PROXY_API_KEY|OPENAI_API_KEY|ANTHROPIC_API_KEY|GOOGLE_API_KEY",
             param_hint="`api_key` to LiteLLMModel(...)",
         )
         completion_params["api_key"] = api_key
-        if self.api_base:
-            completion_params["api_base"] = self.api_base
+        if self.base_url:
+            completion_params["api_base"] = self.base_url
 
         # Add schema if provided
         if schema:
@@ -154,12 +200,19 @@ class LiteLLMModel(DeepEvalBaseLLM):
     )
     async def a_generate(
         self, prompt: str, schema: Optional[BaseModel] = None
-    ) -> Union[str, Dict, Tuple[str, float]]:
+    ) -> Tuple[Union[str, BaseModel], float]:
+
         from litellm import acompletion
 
+        if check_if_multimodal(prompt):
+            prompt = convert_to_multi_modal_array(input=prompt)
+            content = self.generate_content(prompt)
+        else:
+            content = [{"type": "text", "text": prompt}]
+
         completion_params = {
-            "model": self.model_name,
-            "messages": [{"role": "user", "content": prompt}],
+            "model": self.name,
+            "messages": [{"role": "user", "content": content}],
             "temperature": self.temperature,
         }
 
@@ -171,8 +224,8 @@ class LiteLLMModel(DeepEvalBaseLLM):
             param_hint="`api_key` to LiteLLMModel(...)",
         )
         completion_params["api_key"] = api_key
-        if self.api_base:
-            completion_params["api_base"] = self.api_base
+        if self.base_url:
+            completion_params["api_base"] = self.base_url
 
         # Add schema if provided
         if schema:
@@ -221,12 +274,17 @@ class LiteLLMModel(DeepEvalBaseLLM):
             env_var_name="LITELLM_API_KEY|OPENAI_API_KEY|ANTHROPIC_API_KEY|GOOGLE_API_KEY",
             param_hint="`api_key` to LiteLLMModel(...)",
         )
+        if check_if_multimodal(prompt):
+            prompt = convert_to_multi_modal_array(input=prompt)
+            content = self.generate_content(prompt)
+        else:
+            content = [{"type": "text", "text": prompt}]
         completion_params = {
-            "model": self.model_name,
-            "messages": [{"role": "user", "content": prompt}],
+            "model": self.name,
+            "messages": [{"role": "user", "content": content}],
             "temperature": self.temperature,
             "api_key": api_key,
-            "api_base": self.api_base,
+            "api_base": self.base_url,
             "logprobs": True,
             "top_logprobs": top_logprobs,
         }
@@ -262,12 +320,17 @@ class LiteLLMModel(DeepEvalBaseLLM):
             env_var_name="LITELLM_API_KEY|OPENAI_API_KEY|ANTHROPIC_API_KEY|GOOGLE_API_KEY",
             param_hint="`api_key` to LiteLLMModel(...)",
         )
+        if check_if_multimodal(prompt):
+            prompt = convert_to_multi_modal_array(input=prompt)
+            content = self.generate_content(prompt)
+        else:
+            content = [{"type": "text", "text": prompt}]
         completion_params = {
-            "model": self.model_name,
-            "messages": [{"role": "user", "content": prompt}],
+            "model": self.name,
+            "messages": [{"role": "user", "content": content}],
             "temperature": self.temperature,
             "api_key": api_key,
-            "api_base": self.api_base,
+            "api_base": self.base_url,
             "logprobs": True,
             "top_logprobs": top_logprobs,
         }
@@ -302,12 +365,12 @@ class LiteLLMModel(DeepEvalBaseLLM):
             param_hint="`api_key` to LiteLLMModel(...)",
         )
         completion_params = {
-            "model": self.model_name,
+            "model": self.name,
             "messages": [{"role": "user", "content": prompt}],
             "temperature": temperature,
             "n": n,
             "api_key": api_key,
-            "api_base": self.api_base,
+            "api_base": self.base_url,
         }
         completion_params.update(self.kwargs)
 
@@ -320,6 +383,34 @@ class LiteLLMModel(DeepEvalBaseLLM):
             logging.error(f"Error in LiteLLM generate_samples: {e}")
             raise
 
+    def generate_content(
+        self, multimodal_input: List[Union[str, MLLMImage]] = []
+    ):
+        content = []
+        for element in multimodal_input:
+            if isinstance(element, str):
+                content.append({"type": "text", "text": element})
+            elif isinstance(element, MLLMImage):
+                if element.url and not element.local:
+                    content.append(
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": element.url},
+                        }
+                    )
+                else:
+                    element.ensure_images_loaded()
+                    data_uri = (
+                        f"data:{element.mimeType};base64,{element.dataBase64}"
+                    )
+                    content.append(
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": data_uri},
+                        }
+                    )
+        return content
+
     def calculate_cost(self, response: Any) -> float:
         """Calculate the cost of the response based on token usage."""
         try:
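For reference, a mixed prompt passed through generate_content above produces the OpenAI-style content array that LiteLLM forwards; the values below are illustrative only:

    content = [
        {"type": "text", "text": "Describe this image."},
        # remote image: passed through by URL
        {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
        # local image: inlined as a base64 data URI
        {"type": "image_url", "image_url": {"url": "data:image/png;base64,iVBORw0KGgo..."}},
    ]
    messages = [{"role": "user", "content": content}]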
@@ -353,8 +444,8 @@ class LiteLLMModel(DeepEvalBaseLLM):
     def get_model_name(self) -> str:
         from litellm import get_llm_provider
 
-        provider = get_llm_provider(self.model_name)
-        return f"{self.model_name} ({provider})"
+        provider = get_llm_provider(self.name)
+        return f"{self.name} ({provider})"
 
     def load_model(self, async_mode: bool = False):
         """
@@ -369,3 +460,6 @@ class LiteLLMModel(DeepEvalBaseLLM):
             None as LiteLLM handles client creation internally
         """
         return None
+
+    def supports_multimodal(self):
+        return True
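supports_multimodal gives callers a capability flag to gate image prompts on. A hypothetical caller (prompt_with_images stands in for any multimodal prompt):

    model = LiteLLMModel(model="gpt-4o-mini", api_key="sk-...")
    if model.supports_multimodal():
        response, cost = model.generate(prompt_with_images)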
deepeval/models/llms/local_model.py

@@ -1,17 +1,26 @@
-from typing import Optional, Tuple, Union, Dict
+from typing import Optional, Tuple, Union, Dict, List
 from pydantic import BaseModel, SecretStr
 from openai import OpenAI, AsyncOpenAI
 from openai.types.chat import ChatCompletion
 
+from deepeval.errors import DeepEvalError
 from deepeval.config.settings import get_settings
 from deepeval.models.retry_policy import (
     create_retry_decorator,
     sdk_retries_for,
 )
 from deepeval.models.llms.utils import trim_and_load_json
-from deepeval.models.utils import require_secret_api_key
+from deepeval.models.utils import (
+    require_secret_api_key,
+)
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.constants import ProviderSlug as PS
+from deepeval.test_case import MLLMImage
+from deepeval.utils import (
+    check_if_multimodal,
+    convert_to_multi_modal_array,
+    require_param,
+)
 
 
 # consistent retry rules
@@ -22,47 +31,75 @@ class LocalModel(DeepEvalBaseLLM):
     def __init__(
         self,
         model: Optional[str] = None,
-        base_url: Optional[str] = None,
         api_key: Optional[str] = None,
-        temperature: float = 0,
+        base_url: Optional[str] = None,
+        temperature: Optional[float] = None,
         format: Optional[str] = None,
         generation_kwargs: Optional[Dict] = None,
         **kwargs,
     ):
         settings = get_settings()
 
-        model_name = model or settings.LOCAL_MODEL_NAME
+        model = model or settings.LOCAL_MODEL_NAME
         if api_key is not None:
-            # keep it secret, keep it safe from serializings, logging and alike
-            self.local_model_api_key: SecretStr | None = SecretStr(api_key)
+            self.local_model_api_key: Optional[SecretStr] = SecretStr(api_key)
         else:
             self.local_model_api_key = settings.LOCAL_MODEL_API_KEY
 
+        base_url = (
+            base_url if base_url is not None else settings.LOCAL_MODEL_BASE_URL
+        )
         self.base_url = (
-            base_url
-            or settings.LOCAL_MODEL_BASE_URL
-            and str(settings.LOCAL_MODEL_BASE_URL)
+            str(base_url).rstrip("/") if base_url is not None else None
         )
         self.format = format or settings.LOCAL_MODEL_FORMAT
+
+        if temperature is not None:
+            temperature = float(temperature)
+        elif settings.TEMPERATURE is not None:
+            temperature = settings.TEMPERATURE
+        else:
+            temperature = 0.0
+
+        # validation
+        model = require_param(
+            model,
+            provider_label="LocalModel",
+            env_var_name="LOCAL_MODEL_NAME",
+            param_hint="model",
+        )
+
         if temperature < 0:
-            raise ValueError("Temperature must be >= 0.")
+            raise DeepEvalError("Temperature must be >= 0.")
         self.temperature = temperature
+
         self.kwargs = kwargs
-        self.generation_kwargs = generation_kwargs or {}
-        super().__init__(model_name)
+        self.kwargs.pop("temperature", None)
+
+        self.generation_kwargs = dict(generation_kwargs or {})
+        self.generation_kwargs.pop("temperature", None)
+
+        super().__init__(model)
 
     ###############################################
-    # Other generate functions
+    # Generate functions
     ###############################################
 
     @retry_local
     def generate(
         self, prompt: str, schema: Optional[BaseModel] = None
-    ) -> Tuple[Union[str, Dict], float]:
+    ) -> Tuple[Union[str, BaseModel], float]:
+
+        if check_if_multimodal(prompt):
+            prompt = convert_to_multi_modal_array(input=prompt)
+            content = self.generate_content(prompt)
+        else:
+            content = prompt
+
         client = self.load_model(async_mode=False)
         response: ChatCompletion = client.chat.completions.create(
-            model=self.model_name,
-            messages=[{"role": "user", "content": prompt}],
+            model=self.name,
+            messages=[{"role": "user", "content": content}],
             temperature=self.temperature,
             **self.generation_kwargs,
         )
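Worth noting why the removed base_url expression was fragile: it chained or/and, which parses as base_url or (settings.LOCAL_MODEL_BASE_URL and str(...)) and silently yields a falsy value when the setting is unset; it also never normalized trailing slashes. The replacement, reduced to a standalone sketch:

    from typing import Optional


    def normalize_base_url(
        base_url: Optional[str], configured: Optional[str]
    ) -> Optional[str]:
        # explicit argument wins over the configured setting
        value = base_url if base_url is not None else configured
        # strip a trailing slash so path joins stay predictable
        return str(value).rstrip("/") if value is not None else None


    assert normalize_base_url("http://localhost:8000/", None) == "http://localhost:8000"
    assert normalize_base_url(None, None) is None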
@@ -77,11 +114,18 @@ class LocalModel(DeepEvalBaseLLM):
     @retry_local
     async def a_generate(
         self, prompt: str, schema: Optional[BaseModel] = None
-    ) -> Tuple[Union[str, Dict], float]:
+    ) -> Tuple[Union[str, BaseModel], float]:
+
+        if check_if_multimodal(prompt):
+            prompt = convert_to_multi_modal_array(input=prompt)
+            content = self.generate_content(prompt)
+        else:
+            content = prompt
+
         client = self.load_model(async_mode=True)
         response: ChatCompletion = await client.chat.completions.create(
-            model=self.model_name,
-            messages=[{"role": "user", "content": prompt}],
+            model=self.name,
+            messages=[{"role": "user", "content": content}],
             temperature=self.temperature,
             **self.generation_kwargs,
         )
@@ -93,12 +137,72 @@ class LocalModel(DeepEvalBaseLLM):
         else:
             return res_content, 0.0
 
+    def generate_content(
+        self, multimodal_input: List[Union[str, MLLMImage]] = []
+    ):
+        """
+        Converts multimodal input into OpenAI-compatible format.
+        Uses data URIs for all images since we can't guarantee local servers support URL fetching.
+        """
+        prompt = []
+        for element in multimodal_input:
+            if isinstance(element, str):
+                prompt.append({"type": "text", "text": element})
+            elif isinstance(element, MLLMImage):
+                # For local servers, use data URIs for both remote and local images
+                # Most local servers don't support fetching external URLs
+                if element.url and not element.local:
+                    import requests
+                    import base64
+
+                    settings = get_settings()
+                    try:
+                        response = requests.get(
+                            element.url,
+                            timeout=(
+                                settings.MEDIA_IMAGE_CONNECT_TIMEOUT_SECONDS,
+                                settings.MEDIA_IMAGE_READ_TIMEOUT_SECONDS,
+                            ),
+                        )
+                        response.raise_for_status()
+
+                        # Get mime type from response
+                        mime_type = response.headers.get(
+                            "content-type", element.mimeType or "image/jpeg"
+                        )
+
+                        # Encode to base64
+                        b64_data = base64.b64encode(response.content).decode(
+                            "utf-8"
+                        )
+                        data_uri = f"data:{mime_type};base64,{b64_data}"
+
+                    except Exception as e:
+                        raise ValueError(
+                            f"Failed to fetch remote image {element.url}: {e}"
+                        )
+                else:
+                    element.ensure_images_loaded()
+                    mime_type = element.mimeType or "image/jpeg"
+                    data_uri = f"data:{mime_type};base64,{element.dataBase64}"
+
+                prompt.append(
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": data_uri},
+                    }
+                )
+        return prompt
+
     ###############################################
     # Model
     ###############################################
 
     def get_model_name(self):
-        return f"{self.model_name} (Local Model)"
+        return f"{self.name} (Local Model)"
+
+    def supports_multimodal(self):
+        return True
 
     def load_model(self, async_mode: bool = False):
         if not async_mode:
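Unlike LiteLLMModel, which passes remote image URLs through to the provider, LocalModel inlines every image as a base64 data URI, since a local OpenAI-compatible server may not be able to fetch external URLs. The fetch-and-inline step above as a standalone sketch, with a hardcoded timeout standing in for the MEDIA_IMAGE_* settings:

    import base64

    import requests


    def to_data_uri(url: str, fallback_mime: str = "image/jpeg") -> str:
        # fetch the remote image, preserving the server-reported mime type
        response = requests.get(url, timeout=(5, 30))
        response.raise_for_status()
        mime_type = response.headers.get("content-type", fallback_mime)
        # inline the bytes so the local server never has to fetch anything
        b64 = base64.b64encode(response.content).decode("utf-8")
        return f"data:{mime_type};base64,{b64}"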