deepeval 3.7.4__py3-none-any.whl → 3.7.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (224)
  1. deepeval/_version.py +1 -1
  2. deepeval/config/settings.py +35 -1
  3. deepeval/dataset/api.py +23 -1
  4. deepeval/dataset/golden.py +139 -2
  5. deepeval/evaluate/evaluate.py +16 -11
  6. deepeval/evaluate/execute.py +13 -181
  7. deepeval/evaluate/utils.py +6 -26
  8. deepeval/integrations/pydantic_ai/agent.py +19 -2
  9. deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
  10. deepeval/key_handler.py +3 -0
  11. deepeval/metrics/__init__.py +14 -16
  12. deepeval/metrics/answer_relevancy/answer_relevancy.py +118 -116
  13. deepeval/metrics/answer_relevancy/template.py +22 -3
  14. deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
  15. deepeval/metrics/arena_g_eval/template.py +17 -1
  16. deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
  17. deepeval/metrics/argument_correctness/template.py +19 -2
  18. deepeval/metrics/base_metric.py +13 -44
  19. deepeval/metrics/bias/bias.py +102 -108
  20. deepeval/metrics/bias/template.py +14 -2
  21. deepeval/metrics/contextual_precision/contextual_precision.py +96 -94
  22. deepeval/metrics/contextual_precision/template.py +115 -66
  23. deepeval/metrics/contextual_recall/contextual_recall.py +94 -84
  24. deepeval/metrics/contextual_recall/template.py +106 -55
  25. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +86 -84
  26. deepeval/metrics/contextual_relevancy/template.py +87 -58
  27. deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
  28. deepeval/metrics/conversation_completeness/template.py +23 -3
  29. deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
  30. deepeval/metrics/conversational_dag/nodes.py +66 -123
  31. deepeval/metrics/conversational_dag/templates.py +16 -0
  32. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
  33. deepeval/metrics/dag/dag.py +10 -0
  34. deepeval/metrics/dag/nodes.py +63 -126
  35. deepeval/metrics/dag/templates.py +16 -2
  36. deepeval/metrics/exact_match/exact_match.py +9 -1
  37. deepeval/metrics/faithfulness/faithfulness.py +138 -149
  38. deepeval/metrics/faithfulness/schema.py +1 -1
  39. deepeval/metrics/faithfulness/template.py +200 -115
  40. deepeval/metrics/g_eval/g_eval.py +87 -78
  41. deepeval/metrics/g_eval/template.py +18 -1
  42. deepeval/metrics/g_eval/utils.py +7 -6
  43. deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
  44. deepeval/metrics/goal_accuracy/template.py +21 -3
  45. deepeval/metrics/hallucination/hallucination.py +60 -75
  46. deepeval/metrics/hallucination/template.py +13 -0
  47. deepeval/metrics/indicator.py +7 -10
  48. deepeval/metrics/json_correctness/json_correctness.py +40 -38
  49. deepeval/metrics/json_correctness/template.py +10 -0
  50. deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
  51. deepeval/metrics/knowledge_retention/schema.py +9 -3
  52. deepeval/metrics/knowledge_retention/template.py +12 -0
  53. deepeval/metrics/mcp/mcp_task_completion.py +68 -38
  54. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
  55. deepeval/metrics/mcp/template.py +52 -0
  56. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
  57. deepeval/metrics/mcp_use_metric/template.py +12 -0
  58. deepeval/metrics/misuse/misuse.py +77 -97
  59. deepeval/metrics/misuse/template.py +15 -0
  60. deepeval/metrics/multimodal_metrics/__init__.py +0 -19
  61. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +59 -53
  62. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +79 -95
  63. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +59 -53
  64. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +59 -53
  65. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +111 -109
  66. deepeval/metrics/non_advice/non_advice.py +79 -105
  67. deepeval/metrics/non_advice/template.py +12 -0
  68. deepeval/metrics/pattern_match/pattern_match.py +12 -4
  69. deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
  70. deepeval/metrics/pii_leakage/template.py +14 -0
  71. deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
  72. deepeval/metrics/plan_adherence/template.py +11 -0
  73. deepeval/metrics/plan_quality/plan_quality.py +63 -87
  74. deepeval/metrics/plan_quality/template.py +9 -0
  75. deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
  76. deepeval/metrics/prompt_alignment/template.py +12 -0
  77. deepeval/metrics/ragas.py +3 -3
  78. deepeval/metrics/role_adherence/role_adherence.py +48 -71
  79. deepeval/metrics/role_adherence/template.py +14 -0
  80. deepeval/metrics/role_violation/role_violation.py +75 -108
  81. deepeval/metrics/role_violation/template.py +12 -0
  82. deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
  83. deepeval/metrics/step_efficiency/template.py +11 -0
  84. deepeval/metrics/summarization/summarization.py +115 -183
  85. deepeval/metrics/summarization/template.py +19 -0
  86. deepeval/metrics/task_completion/task_completion.py +67 -73
  87. deepeval/metrics/tool_correctness/tool_correctness.py +45 -44
  88. deepeval/metrics/tool_use/tool_use.py +42 -66
  89. deepeval/metrics/topic_adherence/template.py +13 -0
  90. deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
  91. deepeval/metrics/toxicity/template.py +13 -0
  92. deepeval/metrics/toxicity/toxicity.py +80 -99
  93. deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  94. deepeval/metrics/turn_contextual_precision/template.py +187 -0
  95. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +592 -0
  96. deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  97. deepeval/metrics/turn_contextual_recall/template.py +178 -0
  98. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +563 -0
  99. deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
  100. deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
  101. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +576 -0
  102. deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
  103. deepeval/metrics/turn_faithfulness/template.py +218 -0
  104. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +627 -0
  105. deepeval/metrics/turn_relevancy/template.py +14 -0
  106. deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
  107. deepeval/metrics/utils.py +158 -122
  108. deepeval/models/__init__.py +0 -12
  109. deepeval/models/base_model.py +49 -33
  110. deepeval/models/embedding_models/__init__.py +7 -0
  111. deepeval/models/embedding_models/azure_embedding_model.py +79 -33
  112. deepeval/models/embedding_models/local_embedding_model.py +39 -20
  113. deepeval/models/embedding_models/ollama_embedding_model.py +52 -19
  114. deepeval/models/embedding_models/openai_embedding_model.py +42 -22
  115. deepeval/models/llms/amazon_bedrock_model.py +226 -72
  116. deepeval/models/llms/anthropic_model.py +178 -63
  117. deepeval/models/llms/azure_model.py +218 -60
  118. deepeval/models/llms/constants.py +2032 -0
  119. deepeval/models/llms/deepseek_model.py +95 -40
  120. deepeval/models/llms/gemini_model.py +209 -64
  121. deepeval/models/llms/grok_model.py +139 -68
  122. deepeval/models/llms/kimi_model.py +140 -90
  123. deepeval/models/llms/litellm_model.py +131 -37
  124. deepeval/models/llms/local_model.py +125 -21
  125. deepeval/models/llms/ollama_model.py +147 -24
  126. deepeval/models/llms/openai_model.py +222 -269
  127. deepeval/models/llms/portkey_model.py +81 -22
  128. deepeval/models/llms/utils.py +8 -3
  129. deepeval/models/retry_policy.py +17 -14
  130. deepeval/models/utils.py +106 -5
  131. deepeval/optimizer/__init__.py +5 -0
  132. deepeval/optimizer/algorithms/__init__.py +6 -0
  133. deepeval/optimizer/algorithms/base.py +29 -0
  134. deepeval/optimizer/algorithms/configs.py +18 -0
  135. deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  136. deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
  137. deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  138. deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
  139. deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  140. deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  141. deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  142. deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  143. deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  144. deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
  145. deepeval/{optimization → optimizer}/configs.py +5 -8
  146. deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
  147. deepeval/optimizer/prompt_optimizer.py +263 -0
  148. deepeval/optimizer/rewriter/__init__.py +5 -0
  149. deepeval/optimizer/rewriter/rewriter.py +124 -0
  150. deepeval/optimizer/rewriter/utils.py +214 -0
  151. deepeval/optimizer/scorer/__init__.py +5 -0
  152. deepeval/optimizer/scorer/base.py +86 -0
  153. deepeval/optimizer/scorer/scorer.py +316 -0
  154. deepeval/optimizer/scorer/utils.py +30 -0
  155. deepeval/optimizer/types.py +148 -0
  156. deepeval/{optimization → optimizer}/utils.py +47 -165
  157. deepeval/prompt/prompt.py +5 -9
  158. deepeval/simulator/conversation_simulator.py +43 -0
  159. deepeval/simulator/template.py +13 -0
  160. deepeval/test_case/__init__.py +1 -3
  161. deepeval/test_case/api.py +26 -45
  162. deepeval/test_case/arena_test_case.py +7 -2
  163. deepeval/test_case/conversational_test_case.py +68 -1
  164. deepeval/test_case/llm_test_case.py +206 -1
  165. deepeval/test_case/utils.py +4 -8
  166. deepeval/test_run/api.py +18 -14
  167. deepeval/test_run/test_run.py +3 -3
  168. deepeval/tracing/patchers.py +9 -4
  169. deepeval/tracing/tracing.py +2 -2
  170. deepeval/utils.py +65 -0
  171. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -4
  172. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/RECORD +180 -193
  173. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
  174. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
  175. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
  176. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
  177. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
  178. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
  179. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
  180. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
  181. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
  182. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
  183. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
  184. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  185. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
  186. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
  187. deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
  188. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
  189. deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
  190. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -148
  191. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
  192. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  193. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
  194. deepeval/models/mlllms/__init__.py +0 -4
  195. deepeval/models/mlllms/azure_model.py +0 -343
  196. deepeval/models/mlllms/gemini_model.py +0 -313
  197. deepeval/models/mlllms/ollama_model.py +0 -175
  198. deepeval/models/mlllms/openai_model.py +0 -309
  199. deepeval/optimization/__init__.py +0 -13
  200. deepeval/optimization/adapters/__init__.py +0 -2
  201. deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
  202. deepeval/optimization/aggregates.py +0 -14
  203. deepeval/optimization/copro/configs.py +0 -31
  204. deepeval/optimization/gepa/__init__.py +0 -7
  205. deepeval/optimization/gepa/configs.py +0 -115
  206. deepeval/optimization/miprov2/configs.py +0 -134
  207. deepeval/optimization/miprov2/loop.py +0 -785
  208. deepeval/optimization/mutations/__init__.py +0 -0
  209. deepeval/optimization/mutations/prompt_rewriter.py +0 -458
  210. deepeval/optimization/policies/__init__.py +0 -16
  211. deepeval/optimization/policies/tie_breaker.py +0 -67
  212. deepeval/optimization/prompt_optimizer.py +0 -462
  213. deepeval/optimization/simba/__init__.py +0 -0
  214. deepeval/optimization/simba/configs.py +0 -33
  215. deepeval/optimization/types.py +0 -361
  216. deepeval/test_case/mllm_test_case.py +0 -170
  217. /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
  218. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
  219. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
  220. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
  221. /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
  222. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
  223. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
  224. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
deepeval/models/llms/azure_model.py +218 -60
@@ -1,71 +1,159 @@
1
1
  from openai.types.chat.chat_completion import ChatCompletion
2
2
  from openai import AzureOpenAI, AsyncAzureOpenAI
3
- from typing import Optional, Tuple, Union, Dict
3
+ from typing import Optional, Tuple, Union, Dict, List
4
4
  from pydantic import BaseModel, SecretStr
5
5
 
6
+ from deepeval.errors import DeepEvalError
6
7
  from deepeval.config.settings import get_settings
7
8
  from deepeval.models import DeepEvalBaseLLM
8
- from deepeval.models.llms.openai_model import (
9
- structured_outputs_models,
10
- json_mode_models,
11
- model_pricing,
12
- )
9
+ from deepeval.models.llms.constants import OPENAI_MODELS_DATA
13
10
  from deepeval.models.retry_policy import (
14
11
  create_retry_decorator,
15
12
  sdk_retries_for,
16
13
  )
17
-
18
- from deepeval.models.llms.utils import trim_and_load_json
19
- from deepeval.models.utils import parse_model_name, require_secret_api_key
14
+ from deepeval.test_case import MLLMImage
15
+ from deepeval.utils import (
16
+ convert_to_multi_modal_array,
17
+ check_if_multimodal,
18
+ require_param,
19
+ )
20
+ from deepeval.models.llms.utils import (
21
+ trim_and_load_json,
22
+ )
23
+ from deepeval.models.utils import (
24
+ parse_model_name,
25
+ require_secret_api_key,
26
+ require_costs,
27
+ normalize_kwargs_and_extract_aliases,
28
+ )
20
29
  from deepeval.constants import ProviderSlug as PS
21
30
 
22
-
23
31
  retry_azure = create_retry_decorator(PS.AZURE)
24
32
 
33
+ _ALIAS_MAP = {
34
+ "api_key": ["azure_openai_api_key"],
35
+ "base_url": ["azure_endpoint"],
36
+ }
37
+
25
38
 
26
39
  class AzureOpenAIModel(DeepEvalBaseLLM):
27
40
  def __init__(
28
41
  self,
42
+ model: Optional[str] = None,
43
+ api_key: Optional[str] = None,
44
+ base_url: Optional[str] = None,
45
+ temperature: Optional[float] = None,
46
+ cost_per_input_token: Optional[float] = None,
47
+ cost_per_output_token: Optional[float] = None,
29
48
  deployment_name: Optional[str] = None,
30
- model_name: Optional[str] = None,
31
- azure_openai_api_key: Optional[str] = None,
32
- openai_api_version: Optional[str] = None,
33
- azure_endpoint: Optional[str] = None,
34
- temperature: float = 0,
49
+ api_version: Optional[str] = None,
35
50
  generation_kwargs: Optional[Dict] = None,
36
51
  **kwargs,
37
52
  ):
38
53
  settings = get_settings()
54
+ normalized_kwargs, alias_values = normalize_kwargs_and_extract_aliases(
55
+ "AzureOpenAIModel",
56
+ kwargs,
57
+ _ALIAS_MAP,
58
+ )
59
+
60
+ # re-map deprecated keywords to re-named positional args
61
+ if api_key is None and "api_key" in alias_values:
62
+ api_key = alias_values["api_key"]
63
+ if base_url is None and "base_url" in alias_values:
64
+ base_url = alias_values["base_url"]
39
65
 
40
66
  # fetch Azure deployment parameters
41
- model_name = model_name or settings.AZURE_MODEL_NAME
42
- self.deployment_name = deployment_name or settings.AZURE_DEPLOYMENT_NAME
67
+ model = model or settings.AZURE_MODEL_NAME
68
+ deployment_name = deployment_name or settings.AZURE_DEPLOYMENT_NAME
43
69
 
44
- if azure_openai_api_key is not None:
70
+ if api_key is not None:
45
71
  # keep it secret, keep it safe from serializings, logging and alike
46
- self.azure_openai_api_key: SecretStr | None = SecretStr(
47
- azure_openai_api_key
48
- )
72
+ self.api_key: Optional[SecretStr] = SecretStr(api_key)
49
73
  else:
50
- self.azure_openai_api_key = settings.AZURE_OPENAI_API_KEY
74
+ self.api_key = settings.AZURE_OPENAI_API_KEY
51
75
 
52
- self.openai_api_version = (
53
- openai_api_version or settings.OPENAI_API_VERSION
76
+ api_version = api_version or settings.OPENAI_API_VERSION
77
+ if base_url is not None:
78
+ base_url = str(base_url).rstrip("/")
79
+ elif settings.AZURE_OPENAI_ENDPOINT is not None:
80
+ base_url = str(settings.AZURE_OPENAI_ENDPOINT).rstrip("/")
81
+
82
+ if temperature is not None:
83
+ temperature = float(temperature)
84
+ elif settings.TEMPERATURE is not None:
85
+ temperature = settings.TEMPERATURE
86
+ else:
87
+ temperature = 0.0
88
+
89
+ cost_per_input_token = (
90
+ cost_per_input_token
91
+ if cost_per_input_token is not None
92
+ else settings.OPENAI_COST_PER_INPUT_TOKEN
93
+ )
94
+ cost_per_output_token = (
95
+ cost_per_output_token
96
+ if cost_per_output_token is not None
97
+ else settings.OPENAI_COST_PER_OUTPUT_TOKEN
54
98
  )
55
- self.azure_endpoint = (
56
- azure_endpoint
57
- or settings.AZURE_OPENAI_ENDPOINT
58
- and str(settings.AZURE_OPENAI_ENDPOINT)
99
+
100
+ # validation
101
+ model = require_param(
102
+ model,
103
+ provider_label="AzureOpenAIModel",
104
+ env_var_name="AZURE_MODEL_NAME",
105
+ param_hint="model",
59
106
  )
60
107
 
108
+ self.deployment_name = require_param(
109
+ deployment_name,
110
+ provider_label="AzureOpenAIModel",
111
+ env_var_name="AZURE_DEPLOYMENT_NAME",
112
+ param_hint="deployment_name",
113
+ )
114
+
115
+ self.base_url = require_param(
116
+ base_url,
117
+ provider_label="AzureOpenAIModel",
118
+ env_var_name="AZURE_OPENAI_ENDPOINT",
119
+ param_hint="base_url",
120
+ )
121
+
122
+ self.api_version = require_param(
123
+ api_version,
124
+ provider_label="AzureOpenAIModel",
125
+ env_var_name="OPENAI_API_VERSION",
126
+ param_hint="api_version",
127
+ )
128
+
129
+ self.model_data = OPENAI_MODELS_DATA.get(model)
130
+ cost_per_input_token, cost_per_output_token = require_costs(
131
+ self.model_data,
132
+ model,
133
+ "OPENAI_COST_PER_INPUT_TOKEN",
134
+ "OPENAI_COST_PER_OUTPUT_TOKEN",
135
+ cost_per_input_token,
136
+ cost_per_output_token,
137
+ )
138
+ self.model_data.input_price = cost_per_input_token
139
+ self.model_data.output_price = cost_per_output_token
140
+
61
141
  if temperature < 0:
62
- raise ValueError("Temperature must be >= 0.")
142
+ raise DeepEvalError("Temperature must be >= 0.")
63
143
  self.temperature = temperature
64
144
 
65
- # args and kwargs will be passed to the underlying model, in load_model function
66
- self.kwargs = kwargs
67
- self.generation_kwargs = generation_kwargs or {}
68
- super().__init__(parse_model_name(model_name))
145
+ # Keep sanitized kwargs for client call to strip legacy keys
146
+ self.kwargs = normalized_kwargs
147
+ self.kwargs.pop(
148
+ "temperature", None
149
+ ) # to avoid duplicate with self.temperature
150
+
151
+ self.generation_kwargs = dict(generation_kwargs or {})
152
+ self.generation_kwargs.pop(
153
+ "temperature", None
154
+ ) # to avoid duplicate with self.temperature
155
+
156
+ super().__init__(parse_model_name(model))
69
157
 
70
158
  ###############################################
71
159
  # Other generate functions
@@ -74,17 +162,23 @@ class AzureOpenAIModel(DeepEvalBaseLLM):
74
162
  @retry_azure
75
163
  def generate(
76
164
  self, prompt: str, schema: Optional[BaseModel] = None
77
- ) -> Tuple[Union[str, Dict], float]:
165
+ ) -> Tuple[Union[str, BaseModel], float]:
78
166
  client = self.load_model(async_mode=False)
167
+
168
+ if check_if_multimodal(prompt):
169
+ prompt = convert_to_multi_modal_array(input=prompt)
170
+ content = self.generate_content(prompt)
171
+ else:
172
+ content = [{"type": "text", "text": prompt}]
173
+
79
174
  if schema:
80
- if self.model_name in structured_outputs_models:
175
+ if self.model_data.supports_structured_outputs:
81
176
  completion = client.beta.chat.completions.parse(
82
177
  model=self.deployment_name,
83
- messages=[
84
- {"role": "user", "content": prompt},
85
- ],
178
+ messages=[{"role": "user", "content": content}],
86
179
  response_format=schema,
87
180
  temperature=self.temperature,
181
+ **self.generation_kwargs,
88
182
  )
89
183
  structured_output: BaseModel = completion.choices[
90
184
  0
@@ -94,14 +188,15 @@ class AzureOpenAIModel(DeepEvalBaseLLM):
94
188
  completion.usage.completion_tokens,
95
189
  )
96
190
  return structured_output, cost
97
- if self.model_name in json_mode_models:
191
+ if self.model_data.supports_json:
98
192
  completion = client.beta.chat.completions.parse(
99
193
  model=self.deployment_name,
100
194
  messages=[
101
- {"role": "user", "content": prompt},
195
+ {"role": "user", "content": content},
102
196
  ],
103
197
  response_format={"type": "json_object"},
104
198
  temperature=self.temperature,
199
+ **self.generation_kwargs,
105
200
  )
106
201
  json_output = trim_and_load_json(
107
202
  completion.choices[0].message.content
@@ -115,7 +210,7 @@ class AzureOpenAIModel(DeepEvalBaseLLM):
115
210
  completion = client.chat.completions.create(
116
211
  model=self.deployment_name,
117
212
  messages=[
118
- {"role": "user", "content": prompt},
213
+ {"role": "user", "content": content},
119
214
  ],
120
215
  temperature=self.temperature,
121
216
  **self.generation_kwargs,
@@ -135,15 +230,21 @@ class AzureOpenAIModel(DeepEvalBaseLLM):
135
230
  self, prompt: str, schema: Optional[BaseModel] = None
136
231
  ) -> Tuple[Union[str, BaseModel], float]:
137
232
  client = self.load_model(async_mode=True)
233
+
234
+ if check_if_multimodal(prompt):
235
+ prompt = convert_to_multi_modal_array(input=prompt)
236
+ content = self.generate_content(prompt)
237
+ else:
238
+ content = [{"type": "text", "text": prompt}]
239
+
138
240
  if schema:
139
- if self.model_name in structured_outputs_models:
241
+ if self.model_data.supports_structured_outputs:
140
242
  completion = await client.beta.chat.completions.parse(
141
243
  model=self.deployment_name,
142
- messages=[
143
- {"role": "user", "content": prompt},
144
- ],
244
+ messages=[{"role": "user", "content": content}],
145
245
  response_format=schema,
146
246
  temperature=self.temperature,
247
+ **self.generation_kwargs,
147
248
  )
148
249
  structured_output: BaseModel = completion.choices[
149
250
  0
@@ -153,11 +254,11 @@ class AzureOpenAIModel(DeepEvalBaseLLM):
153
254
  completion.usage.completion_tokens,
154
255
  )
155
256
  return structured_output, cost
156
- if self.model_name in json_mode_models:
257
+ if self.model_data.supports_json:
157
258
  completion = await client.beta.chat.completions.parse(
158
259
  model=self.deployment_name,
159
260
  messages=[
160
- {"role": "user", "content": prompt},
261
+ {"role": "user", "content": content},
161
262
  ],
162
263
  response_format={"type": "json_object"},
163
264
  temperature=self.temperature,
@@ -175,7 +276,7 @@ class AzureOpenAIModel(DeepEvalBaseLLM):
175
276
  completion = await client.chat.completions.create(
176
277
  model=self.deployment_name,
177
278
  messages=[
178
- {"role": "user", "content": prompt},
279
+ {"role": "user", "content": content},
179
280
  ],
180
281
  temperature=self.temperature,
181
282
  **self.generation_kwargs,
@@ -203,9 +304,14 @@ class AzureOpenAIModel(DeepEvalBaseLLM):
203
304
  ) -> Tuple[ChatCompletion, float]:
204
305
  # Generate completion
205
306
  client = self.load_model(async_mode=False)
307
+ if check_if_multimodal(prompt):
308
+ prompt = convert_to_multi_modal_array(input=prompt)
309
+ content = self.generate_content(prompt)
310
+ else:
311
+ content = [{"type": "text", "text": prompt}]
206
312
  completion = client.chat.completions.create(
207
313
  model=self.deployment_name,
208
- messages=[{"role": "user", "content": prompt}],
314
+ messages=[{"role": "user", "content": content}],
209
315
  temperature=self.temperature,
210
316
  logprobs=True,
211
317
  top_logprobs=top_logprobs,
@@ -226,9 +332,14 @@ class AzureOpenAIModel(DeepEvalBaseLLM):
226
332
  ) -> Tuple[ChatCompletion, float]:
227
333
  # Generate completion
228
334
  client = self.load_model(async_mode=True)
335
+ if check_if_multimodal(prompt):
336
+ prompt = convert_to_multi_modal_array(input=prompt)
337
+ content = self.generate_content(prompt)
338
+ else:
339
+ content = [{"type": "text", "text": prompt}]
229
340
  completion = await client.chat.completions.create(
230
341
  model=self.deployment_name,
231
- messages=[{"role": "user", "content": prompt}],
342
+ messages=[{"role": "user", "content": content}],
232
343
  temperature=self.temperature,
233
344
  logprobs=True,
234
345
  top_logprobs=top_logprobs,
@@ -241,22 +352,66 @@ class AzureOpenAIModel(DeepEvalBaseLLM):
241
352
 
242
353
  return completion, cost
243
354
 
355
+ def generate_content(
356
+ self, multimodal_input: Optional[List[Union[str, MLLMImage]]] = None
357
+ ):
358
+ multimodal_input = [] if multimodal_input is None else multimodal_input
359
+ content = []
360
+ for element in multimodal_input:
361
+ if isinstance(element, str):
362
+ content.append({"type": "text", "text": element})
363
+ elif isinstance(element, MLLMImage):
364
+ if element.url and not element.local:
365
+ content.append(
366
+ {
367
+ "type": "image_url",
368
+ "image_url": {"url": element.url},
369
+ }
370
+ )
371
+ else:
372
+ element.ensure_images_loaded()
373
+ data_uri = (
374
+ f"data:{element.mimeType};base64,{element.dataBase64}"
375
+ )
376
+ content.append(
377
+ {
378
+ "type": "image_url",
379
+ "image_url": {"url": data_uri},
380
+ }
381
+ )
382
+ return content
383
+
244
384
  ###############################################
245
385
  # Utilities
246
386
  ###############################################
247
387
 
248
388
  def calculate_cost(self, input_tokens: int, output_tokens: int) -> float:
249
- pricing = model_pricing.get(self.model_name, model_pricing["gpt-4.1"])
250
- input_cost = input_tokens * pricing["input"]
251
- output_cost = output_tokens * pricing["output"]
389
+ input_cost = input_tokens * self.model_data.input_price
390
+ output_cost = output_tokens * self.model_data.output_price
252
391
  return input_cost + output_cost
253
392
 
254
393
  ###############################################
255
- # Model
394
+ # Capabilities
256
395
  ###############################################
257
396
 
258
- def get_model_name(self):
259
- return f"Azure OpenAI ({self.model_name})"
397
+ def supports_log_probs(self) -> Union[bool, None]:
398
+ return self.model_data.supports_log_probs
399
+
400
+ def supports_temperature(self) -> Union[bool, None]:
401
+ return self.model_data.supports_temperature
402
+
403
+ def supports_multimodal(self) -> Union[bool, None]:
404
+ return self.model_data.supports_multimodal
405
+
406
+ def supports_structured_outputs(self) -> Union[bool, None]:
407
+ return self.model_data.supports_structured_outputs
408
+
409
+ def supports_json_mode(self) -> Union[bool, None]:
410
+ return self.model_data.supports_json
411
+
412
+ ###############################################
413
+ # Model
414
+ ###############################################
260
415
 
261
416
  def load_model(self, async_mode: bool = False):
262
417
  if not async_mode:
@@ -276,16 +431,16 @@ class AzureOpenAIModel(DeepEvalBaseLLM):
276
431
 
277
432
  def _build_client(self, cls):
278
433
  api_key = require_secret_api_key(
279
- self.azure_openai_api_key,
434
+ self.api_key,
280
435
  provider_label="AzureOpenAI",
281
436
  env_var_name="AZURE_OPENAI_API_KEY",
282
- param_hint="`azure_openai_api_key` to AzureOpenAIModel(...)",
437
+ param_hint="`api_key` to AzureOpenAIModel(...)",
283
438
  )
284
439
 
285
440
  kw = dict(
286
441
  api_key=api_key,
287
- api_version=self.openai_api_version,
288
- azure_endpoint=self.azure_endpoint,
442
+ api_version=self.api_version,
443
+ azure_endpoint=self.base_url,
289
444
  azure_deployment=self.deployment_name,
290
445
  **self._client_kwargs(),
291
446
  )
@@ -297,3 +452,6 @@ class AzureOpenAIModel(DeepEvalBaseLLM):
297
452
  kw.pop("max_retries", None)
298
453
  return cls(**kw)
299
454
  raise
455
+
456
+ def get_model_name(self):
457
+ return f"{self.name} (Azure)"