deepeval 3.7.5__py3-none-any.whl → 3.7.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150)
  1. deepeval/_version.py +1 -1
  2. deepeval/cli/main.py +2022 -759
  3. deepeval/cli/utils.py +208 -36
  4. deepeval/config/dotenv_handler.py +19 -0
  5. deepeval/config/settings.py +675 -245
  6. deepeval/config/utils.py +9 -1
  7. deepeval/dataset/api.py +23 -1
  8. deepeval/dataset/golden.py +106 -21
  9. deepeval/evaluate/evaluate.py +0 -3
  10. deepeval/evaluate/execute.py +162 -315
  11. deepeval/evaluate/utils.py +6 -30
  12. deepeval/key_handler.py +124 -51
  13. deepeval/metrics/__init__.py +0 -4
  14. deepeval/metrics/answer_relevancy/answer_relevancy.py +89 -132
  15. deepeval/metrics/answer_relevancy/template.py +102 -179
  16. deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
  17. deepeval/metrics/arena_g_eval/template.py +17 -1
  18. deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
  19. deepeval/metrics/argument_correctness/template.py +19 -2
  20. deepeval/metrics/base_metric.py +19 -41
  21. deepeval/metrics/bias/bias.py +102 -108
  22. deepeval/metrics/bias/template.py +14 -2
  23. deepeval/metrics/contextual_precision/contextual_precision.py +56 -92
  24. deepeval/metrics/contextual_recall/contextual_recall.py +58 -85
  25. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +53 -83
  26. deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
  27. deepeval/metrics/conversation_completeness/template.py +23 -3
  28. deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
  29. deepeval/metrics/conversational_dag/nodes.py +66 -123
  30. deepeval/metrics/conversational_dag/templates.py +16 -0
  31. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
  32. deepeval/metrics/dag/dag.py +10 -0
  33. deepeval/metrics/dag/nodes.py +63 -126
  34. deepeval/metrics/dag/templates.py +14 -0
  35. deepeval/metrics/exact_match/exact_match.py +9 -1
  36. deepeval/metrics/faithfulness/faithfulness.py +82 -136
  37. deepeval/metrics/g_eval/g_eval.py +93 -79
  38. deepeval/metrics/g_eval/template.py +18 -1
  39. deepeval/metrics/g_eval/utils.py +7 -6
  40. deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
  41. deepeval/metrics/goal_accuracy/template.py +21 -3
  42. deepeval/metrics/hallucination/hallucination.py +60 -75
  43. deepeval/metrics/hallucination/template.py +13 -0
  44. deepeval/metrics/indicator.py +11 -10
  45. deepeval/metrics/json_correctness/json_correctness.py +40 -38
  46. deepeval/metrics/json_correctness/template.py +10 -0
  47. deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
  48. deepeval/metrics/knowledge_retention/schema.py +9 -3
  49. deepeval/metrics/knowledge_retention/template.py +12 -0
  50. deepeval/metrics/mcp/mcp_task_completion.py +72 -43
  51. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +93 -75
  52. deepeval/metrics/mcp/schema.py +4 -0
  53. deepeval/metrics/mcp/template.py +59 -0
  54. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
  55. deepeval/metrics/mcp_use_metric/template.py +12 -0
  56. deepeval/metrics/misuse/misuse.py +77 -97
  57. deepeval/metrics/misuse/template.py +15 -0
  58. deepeval/metrics/multimodal_metrics/__init__.py +0 -1
  59. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +37 -38
  60. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +55 -76
  61. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +37 -38
  62. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +37 -38
  63. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +57 -76
  64. deepeval/metrics/non_advice/non_advice.py +79 -105
  65. deepeval/metrics/non_advice/template.py +12 -0
  66. deepeval/metrics/pattern_match/pattern_match.py +12 -4
  67. deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
  68. deepeval/metrics/pii_leakage/template.py +14 -0
  69. deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
  70. deepeval/metrics/plan_adherence/template.py +11 -0
  71. deepeval/metrics/plan_quality/plan_quality.py +63 -87
  72. deepeval/metrics/plan_quality/template.py +9 -0
  73. deepeval/metrics/prompt_alignment/prompt_alignment.py +78 -86
  74. deepeval/metrics/prompt_alignment/template.py +12 -0
  75. deepeval/metrics/role_adherence/role_adherence.py +48 -71
  76. deepeval/metrics/role_adherence/template.py +14 -0
  77. deepeval/metrics/role_violation/role_violation.py +75 -108
  78. deepeval/metrics/role_violation/template.py +12 -0
  79. deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
  80. deepeval/metrics/step_efficiency/template.py +11 -0
  81. deepeval/metrics/summarization/summarization.py +115 -183
  82. deepeval/metrics/summarization/template.py +19 -0
  83. deepeval/metrics/task_completion/task_completion.py +67 -73
  84. deepeval/metrics/tool_correctness/tool_correctness.py +43 -42
  85. deepeval/metrics/tool_use/schema.py +4 -0
  86. deepeval/metrics/tool_use/template.py +16 -2
  87. deepeval/metrics/tool_use/tool_use.py +72 -94
  88. deepeval/metrics/topic_adherence/schema.py +4 -0
  89. deepeval/metrics/topic_adherence/template.py +21 -1
  90. deepeval/metrics/topic_adherence/topic_adherence.py +68 -81
  91. deepeval/metrics/toxicity/template.py +13 -0
  92. deepeval/metrics/toxicity/toxicity.py +80 -99
  93. deepeval/metrics/turn_contextual_precision/schema.py +3 -3
  94. deepeval/metrics/turn_contextual_precision/template.py +9 -2
  95. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +154 -154
  96. deepeval/metrics/turn_contextual_recall/schema.py +3 -3
  97. deepeval/metrics/turn_contextual_recall/template.py +8 -1
  98. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +148 -143
  99. deepeval/metrics/turn_contextual_relevancy/schema.py +2 -2
  100. deepeval/metrics/turn_contextual_relevancy/template.py +8 -1
  101. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +154 -157
  102. deepeval/metrics/turn_faithfulness/schema.py +1 -1
  103. deepeval/metrics/turn_faithfulness/template.py +8 -1
  104. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +180 -203
  105. deepeval/metrics/turn_relevancy/template.py +14 -0
  106. deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
  107. deepeval/metrics/utils.py +161 -91
  108. deepeval/models/__init__.py +2 -0
  109. deepeval/models/base_model.py +44 -6
  110. deepeval/models/embedding_models/azure_embedding_model.py +34 -12
  111. deepeval/models/embedding_models/local_embedding_model.py +22 -7
  112. deepeval/models/embedding_models/ollama_embedding_model.py +17 -6
  113. deepeval/models/embedding_models/openai_embedding_model.py +3 -2
  114. deepeval/models/llms/__init__.py +2 -0
  115. deepeval/models/llms/amazon_bedrock_model.py +229 -73
  116. deepeval/models/llms/anthropic_model.py +143 -48
  117. deepeval/models/llms/azure_model.py +169 -95
  118. deepeval/models/llms/constants.py +2032 -0
  119. deepeval/models/llms/deepseek_model.py +82 -35
  120. deepeval/models/llms/gemini_model.py +126 -67
  121. deepeval/models/llms/grok_model.py +128 -65
  122. deepeval/models/llms/kimi_model.py +129 -87
  123. deepeval/models/llms/litellm_model.py +94 -18
  124. deepeval/models/llms/local_model.py +115 -16
  125. deepeval/models/llms/ollama_model.py +97 -76
  126. deepeval/models/llms/openai_model.py +169 -311
  127. deepeval/models/llms/portkey_model.py +58 -16
  128. deepeval/models/llms/utils.py +5 -2
  129. deepeval/models/retry_policy.py +10 -5
  130. deepeval/models/utils.py +56 -4
  131. deepeval/simulator/conversation_simulator.py +49 -2
  132. deepeval/simulator/template.py +16 -1
  133. deepeval/synthesizer/synthesizer.py +19 -17
  134. deepeval/test_case/api.py +24 -45
  135. deepeval/test_case/arena_test_case.py +7 -2
  136. deepeval/test_case/conversational_test_case.py +55 -6
  137. deepeval/test_case/llm_test_case.py +60 -6
  138. deepeval/test_run/api.py +3 -0
  139. deepeval/test_run/test_run.py +6 -1
  140. deepeval/utils.py +26 -0
  141. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/METADATA +3 -3
  142. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/RECORD +145 -148
  143. deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
  144. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
  145. deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
  146. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -133
  147. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
  148. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/LICENSE.md +0 -0
  149. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/WHEEL +0 -0
  150. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/entry_points.txt +0 -0
deepeval/models/llms/openai_model.py
@@ -1,13 +1,13 @@
-import base64
 from openai.types.chat.chat_completion import ChatCompletion
 from typing import Optional, Tuple, Union, Dict, List
 from deepeval.test_case import MLLMImage
 from pydantic import BaseModel, SecretStr
-from io import BytesIO
 from openai import (
     OpenAI,
     AsyncOpenAI,
 )
+
+from deepeval.errors import DeepEvalError
 from deepeval.utils import check_if_multimodal, convert_to_multi_modal_array
 from deepeval.config.settings import get_settings
 from deepeval.constants import ProviderSlug as PS
@@ -15,6 +15,7 @@ from deepeval.models import DeepEvalBaseLLM
 from deepeval.models.llms.utils import trim_and_load_json
 from deepeval.models.utils import (
     parse_model_name,
+    require_costs,
     require_secret_api_key,
     normalize_kwargs_and_extract_aliases,
 )
@@ -22,208 +23,15 @@ from deepeval.models.retry_policy import (
     create_retry_decorator,
     sdk_retries_for,
 )
+from deepeval.models.llms.constants import (
+    OPENAI_MODELS_DATA,
+)
 
 
 retry_openai = create_retry_decorator(PS.OPENAI)
 
-
-valid_gpt_models = [
-    "gpt-3.5-turbo",
-    "gpt-3.5-turbo-0125",
-    "gpt-3.5-turbo-1106",
-    "gpt-4-0125-preview",
-    "gpt-4-1106-preview",
-    "gpt-4-turbo",
-    "gpt-4-turbo-2024-04-09",
-    "gpt-4-turbo-preview",
-    "gpt-4o",
-    "gpt-4o-2024-05-13",
-    "gpt-4o-2024-08-06",
-    "gpt-4o-2024-11-20",
-    "gpt-4o-mini",
-    "gpt-4o-mini-2024-07-18",
-    "gpt-4-32k",
-    "gpt-4-32k-0613",
-    "gpt-4.1",
-    "gpt-4.1-mini",
-    "gpt-4.1-nano",
-    "gpt-4.5-preview",
-    "o1",
-    "o1-preview",
-    "o1-2024-12-17",
-    "o1-preview-2024-09-12",
-    "o1-mini",
-    "o1-mini-2024-09-12",
-    "o3-mini",
-    "o3-mini-2025-01-31",
-    "o4-mini",
-    "o4-mini-2025-04-16",
-    "gpt-4.5-preview-2025-02-27",
-    "gpt-5",
-    "gpt-5-2025-08-07",
-    "gpt-5-mini",
-    "gpt-5-mini-2025-08-07",
-    "gpt-5-nano",
-    "gpt-5-nano-2025-08-07",
-    "gpt-5-chat-latest",
-]
-
-unsupported_log_probs_gpt_models = [
-    "o1",
-    "o1-preview",
-    "o1-2024-12-17",
-    "o1-preview-2024-09-12",
-    "o1-mini",
-    "o1-mini-2024-09-12",
-    "o3-mini",
-    "o3-mini-2025-01-31",
-    "o4-mini",
-    "o4-mini-2025-04-16",
-    "gpt-4.5-preview-2025-02-27",
-    "gpt-5",
-    "gpt-5-2025-08-07",
-    "gpt-5-mini",
-    "gpt-5-mini-2025-08-07",
-    "gpt-5-nano",
-    "gpt-5-nano-2025-08-07",
-    "gpt-5-chat-latest",
-]
-
-unsupported_log_probs_multimodal_gpt_models = [
-    "o1",
-    "o1-preview",
-    "o1-2024-12-17",
-    "o1-preview-2024-09-12",
-    "gpt-4.5-preview-2025-02-27",
-    "o4-mini",
-]
-
-structured_outputs_models = [
-    "gpt-4o",
-    "gpt-4o-2024-05-13",
-    "gpt-4o-2024-08-06",
-    "gpt-4o-2024-11-20",
-    "gpt-4o-mini",
-    "gpt-4o-mini-2024-07-18",
-    "gpt-4.1",
-    "gpt-4.1-mini",
-    "gpt-4.1-nano",
-    "o1",
-    "o1-preview",
-    "o1-2024-12-17",
-    "o3-mini",
-    "o3-mini-2025-01-31",
-    "o4-mini",
-    "o4-mini-2025-04-16",
-    "gpt-4.5-preview-2025-02-27",
-    "gpt-5",
-    "gpt-5-2025-08-07",
-    "gpt-5-mini",
-    "gpt-5-mini-2025-08-07",
-    "gpt-5-nano",
-    "gpt-5-nano-2025-08-07",
-]
-
-json_mode_models = [
-    "gpt-3.5-turbo",
-    "gpt-3.5-turbo-0125",
-    "gpt-3.5-turbo-1106",
-    "gpt-4-0125-preview",
-    "gpt-4-1106-preview",
-    "gpt-4-turbo",
-    "gpt-4-turbo-2024-04-09",
-    "gpt-4-turbo-preview",
-    "gpt-4-32k",
-    "gpt-4-32k-0613",
-]
-
-model_pricing = {
-    "gpt-4o-mini": {"input": 0.150 / 1e6, "output": 0.600 / 1e6},
-    "gpt-4o": {"input": 2.50 / 1e6, "output": 10.00 / 1e6},
-    "gpt-4-turbo": {"input": 10.00 / 1e6, "output": 30.00 / 1e6},
-    "gpt-4-turbo-preview": {"input": 10.00 / 1e6, "output": 30.00 / 1e6},
-    "gpt-4-0125-preview": {"input": 10.00 / 1e6, "output": 30.00 / 1e6},
-    "gpt-4-1106-preview": {"input": 10.00 / 1e6, "output": 30.00 / 1e6},
-    "gpt-4": {"input": 30.00 / 1e6, "output": 60.00 / 1e6},
-    "gpt-4-32k": {"input": 60.00 / 1e6, "output": 120.00 / 1e6},
-    "gpt-3.5-turbo-1106": {"input": 1.00 / 1e6, "output": 2.00 / 1e6},
-    "gpt-3.5-turbo": {"input": 0.50 / 1e6, "output": 1.50 / 1e6},
-    "gpt-3.5-turbo-16k": {"input": 3.00 / 1e6, "output": 4.00 / 1e6},
-    "gpt-3.5-turbo-0125": {"input": 0.50 / 1e6, "output": 1.50 / 1e6},
-    "gpt-3.5-turbo-instruct": {"input": 1.50 / 1e6, "output": 2.00 / 1e6},
-    "o1": {"input": 15.00 / 1e6, "output": 60.00 / 1e6},
-    "o1-preview": {"input": 15.00 / 1e6, "output": 60.00 / 1e6},
-    "o1-2024-12-17": {"input": 15.00 / 1e6, "output": 60.00 / 1e6},
-    "o3-mini": {"input": 1.10 / 1e6, "output": 4.40 / 1e6},
-    "o3-mini-2025-01-31": {"input": 1.10 / 1e6, "output": 4.40 / 1e6},
-    "o4-mini": {"input": 1.10 / 1e6, "output": 4.40 / 1e6},
-    "o4-mini-2025-04-16": {"input": 1.10 / 1e6, "output": 4.40 / 1e6},
-    "gpt-4.1": {
-        "input": 2.00 / 1e6,
-        "output": 8.00 / 1e6,
-    },
-    "gpt-4.1-mini": {
-        "input": 0.4 / 1e6,
-        "output": 1.60 / 1e6,
-    },
-    "gpt-4.1-nano": {
-        "input": 0.1 / 1e6,
-        "output": 0.4 / 1e6,
-    },
-    "gpt-4.5-preview": {
-        "input": 75.00 / 1e6,
-        "output": 150.00 / 1e6,
-    },
-    "gpt-5": {
-        "input": 1.25 / 1e6,
-        "output": 10.00 / 1e6,
-    },
-    "gpt-5-2025-08-07": {
-        "input": 1.25 / 1e6,
-        "output": 10.00 / 1e6,
-    },
-    "gpt-5-mini": {
-        "input": 0.25 / 1e6,
-        "output": 2.00 / 1e6,
-    },
-    "gpt-5-mini-2025-08-07": {
-        "input": 0.25 / 1e6,
-        "output": 2.00 / 1e6,
-    },
-    "gpt-5-nano": {
-        "input": 0.05 / 1e6,
-        "output": 0.40 / 1e6,
-    },
-    "gpt-5-nano-2025-08-07": {
-        "input": 0.05 / 1e6,
-        "output": 0.40 / 1e6,
-    },
-    "gpt-5-chat-latest": {
-        "input": 1.25 / 1e6,
-        "output": 10.00 / 1e6,
-    },
-}
-
 default_gpt_model = "gpt-4.1"
 
-# Thinking models that require temperature=1
-models_requiring_temperature_1 = [
-    "o1",
-    "o1-2024-12-17",
-    "o1-mini",
-    "o1-mini-2024-09-12",
-    "o3-mini",
-    "o3-mini-2025-01-31",
-    "o4-mini",
-    "o4-mini-2025-04-16",
-    "gpt-5",
-    "gpt-5-2025-08-07",
-    "gpt-5-mini",
-    "gpt-5-mini-2025-08-07",
-    "gpt-5-nano",
-    "gpt-5-nano-2025-08-07",
-]
-
 
 def _request_timeout_seconds() -> float:
     timeout = float(get_settings().DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS or 0)
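
The deleted lists and pricing table above move into a single `OPENAI_MODELS_DATA` lookup in the new `deepeval/models/llms/constants.py` (+2032 lines in the file list). A minimal sketch of the record shape the rest of this diff reads from it, assuming a dataclass-style container; only the field names are grounded in the accessors and price writes shown below:

    # Hypothetical sketch: only the attribute names are grounded in this diff
    # (the supports_* accessors and the input_price/output_price writes);
    # the dataclass shape and the example entry are assumptions.
    from dataclasses import dataclass
    from typing import Dict, Optional

    @dataclass
    class ModelData:
        supports_temperature: Optional[bool] = None
        supports_log_probs: Optional[bool] = None
        supports_multimodal: Optional[bool] = None
        supports_structured_outputs: Optional[bool] = None
        supports_json: Optional[bool] = None
        input_price: Optional[float] = None   # USD per input token
        output_price: Optional[float] = None  # USD per output token

    OPENAI_MODELS_DATA: Dict[str, ModelData] = {
        # prices mirror the removed model_pricing entry for gpt-4.1
        "gpt-4.1": ModelData(
            supports_temperature=True,
            supports_log_probs=True,
            supports_structured_outputs=True,
            supports_json=False,
            input_price=2.00 / 1e6,
            output_price=8.00 / 1e6,
        ),
    }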
@@ -236,25 +44,20 @@ _ALIAS_MAP = {
 
 
 class GPTModel(DeepEvalBaseLLM):
-    valid_multimodal_models = [
-        "gpt-4o",
-        "gpt-4o-mini",
-        "gpt-4.1",
-        "gpt-4.1-mini",
-        "gpt-5",
-    ]
 
     def __init__(
         self,
         model: Optional[str] = None,
         api_key: Optional[str] = None,
         base_url: Optional[str] = None,
-        temperature: float = 0,
+        temperature: Optional[float] = None,
         cost_per_input_token: Optional[float] = None,
         cost_per_output_token: Optional[float] = None,
         generation_kwargs: Optional[Dict] = None,
         **kwargs,
     ):
+        settings = get_settings()
+
         normalized_kwargs, alias_values = normalize_kwargs_and_extract_aliases(
             "GPTModel",
             kwargs,
@@ -265,8 +68,10 @@ class GPTModel(DeepEvalBaseLLM):
         if api_key is None and "api_key" in alias_values:
             api_key = alias_values["api_key"]
 
-        settings = get_settings()
         model = model or settings.OPENAI_MODEL_NAME
+        if model is None:
+            model = default_gpt_model
+
         cost_per_input_token = (
             cost_per_input_token
             if cost_per_input_token is not None
@@ -278,71 +83,80 @@ class GPTModel(DeepEvalBaseLLM):
             else settings.OPENAI_COST_PER_OUTPUT_TOKEN
         )
 
-        if model is None:
-            model = default_gpt_model
-
-        if isinstance(model, str):
-            model = parse_model_name(model)
-            if model not in valid_gpt_models:
-                raise ValueError(
-                    f"Invalid model. Available GPT models: {', '.join(model for model in valid_gpt_models)}"
-                )
-
-        if model not in model_pricing:
-            if cost_per_input_token is None or cost_per_output_token is None:
-                raise ValueError(
-                    f"No pricing available for `{model}`. "
-                    "Please provide both `cost_per_input_token` and `cost_per_output_token` when initializing `GPTModel`, "
-                    "or set them via the CLI:\n"
-                    " deepeval set-openai --model=[...] --cost_per_input_token=[...] --cost_per_output_token=[...]"
-                )
-            else:
-                model_pricing[model] = {
-                    "input": float(cost_per_input_token),
-                    "output": float(cost_per_output_token),
-                }
-
         if api_key is not None:
             # keep it secret, keep it safe from serializings, logging and alike
-            self.api_key: SecretStr | None = SecretStr(api_key)
+            self.api_key: Optional[SecretStr] = SecretStr(api_key)
         else:
-            self.api_key = get_settings().OPENAI_API_KEY
+            self.api_key = settings.OPENAI_API_KEY
 
-        self.base_url = base_url
+        self.base_url = (
+            str(base_url).rstrip("/") if base_url is not None else None
+        )
         # args and kwargs will be passed to the underlying model, in load_model function
 
-        # Auto-adjust temperature for models that require it
-        if model in models_requiring_temperature_1:
+        if temperature is not None:
+            temperature = float(temperature)
+        elif settings.TEMPERATURE is not None:
+            temperature = settings.TEMPERATURE
+        else:
+            temperature = 0.0
+
+        if isinstance(model, str):
+            model = parse_model_name(model)
+
+        self.model_data = OPENAI_MODELS_DATA.get(model)
+
+        # Auto-adjust temperature for known models that require it
+        if self.model_data.supports_temperature is False:
             temperature = 1
 
+        # validation
+        cost_per_input_token, cost_per_output_token = require_costs(
+            self.model_data,
+            model,
+            "OPENAI_COST_PER_INPUT_TOKEN",
+            "OPENAI_COST_PER_OUTPUT_TOKEN",
+            cost_per_input_token,
+            cost_per_output_token,
+        )
+        self.model_data.input_price = cost_per_input_token
+        self.model_data.output_price = cost_per_output_token
+
         if temperature < 0:
-            raise ValueError("Temperature must be >= 0.")
+            raise DeepEvalError("Temperature must be >= 0.")
+
         self.temperature = temperature
         # Keep sanitized kwargs for client call to strip legacy keys
         self.kwargs = normalized_kwargs
-        self.generation_kwargs = generation_kwargs or {}
+        self.kwargs.pop("temperature", None)
+
+        self.generation_kwargs = dict(generation_kwargs or {})
+        self.generation_kwargs.pop("temperature", None)
+
         super().__init__(model)
 
-    ###############################################
-    # Generate functions
-    ###############################################
+    ######################
+    # Generate functions #
+    ######################
 
     @retry_openai
     def generate(
         self, prompt: str, schema: Optional[BaseModel] = None
-    ) -> Tuple[Union[str, Dict], float]:
+    ) -> Tuple[Union[str, BaseModel], float]:
         client = self.load_model(async_mode=False)
 
         if check_if_multimodal(prompt):
             prompt = convert_to_multi_modal_array(input=prompt)
-            prompt = self.generate_prompt(prompt)
+            content = self.generate_content(prompt)
+        else:
+            content = [{"type": "text", "text": prompt}]
 
         if schema:
-            if self.name in structured_outputs_models:
+            if self.supports_structured_outputs() is True:
                 completion = client.beta.chat.completions.parse(
                     model=self.name,
                     messages=[
-                        {"role": "user", "content": prompt},
+                        {"role": "user", "content": content},
                    ],
                     response_format=schema,
                     temperature=self.temperature,
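
Taken together, the constructor hunks above replace list-based validation with the `OPENAI_MODELS_DATA` lookup: `temperature` now defaults to `None` and falls back to `settings.TEMPERATURE` then `0.0`, an unset `model` falls back to `default_gpt_model`, and `require_costs` resolves per-token pricing. A hedged usage sketch of the new signature (the import path is assumed; the values are illustrative, not defaults):

    from deepeval.models import GPTModel  # import path assumed

    # Known model: capabilities and pricing resolve via OPENAI_MODELS_DATA.
    judge = GPTModel(model="gpt-4.1")

    # Explicit per-token costs, threaded through require_costs:
    judge = GPTModel(
        model="gpt-4.1",
        temperature=0.5,  # optional now; omitted -> settings.TEMPERATURE, else 0.0
        cost_per_input_token=2.00 / 1e6,
        cost_per_output_token=8.00 / 1e6,
    )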
@@ -356,11 +170,11 @@ class GPTModel(DeepEvalBaseLLM):
                     completion.usage.completion_tokens,
                 )
                 return structured_output, cost
-            if self.name in json_mode_models:
+            if self.supports_json_mode() is True:
                 completion = client.beta.chat.completions.parse(
                     model=self.name,
                     messages=[
-                        {"role": "user", "content": prompt},
+                        {"role": "user", "content": content},
                     ],
                     response_format={"type": "json_object"},
                     temperature=self.temperature,
@@ -377,7 +191,7 @@ class GPTModel(DeepEvalBaseLLM):
 
         completion = client.chat.completions.create(
             model=self.name,
-            messages=[{"role": "user", "content": prompt}],
+            messages=[{"role": "user", "content": content}],
             temperature=self.temperature,
             **self.generation_kwargs,
         )
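
The sync `generate` path above now returns the parsed schema instance (`Tuple[Union[str, BaseModel], float]`) and branches on the capability helpers rather than the deleted module-level lists. A caller sketch reusing the `judge` instance from the previous sketch; the schema is hypothetical:

    from pydantic import BaseModel

    class Verdict(BaseModel):  # hypothetical schema
        verdict: str
        reason: str

    # Uses the structured-outputs path when supports_structured_outputs() is
    # True; presumably falls back to JSON mode / trim_and_load_json otherwise.
    result, cost = judge.generate("Return a verdict as JSON.", schema=Verdict)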
@@ -399,14 +213,16 @@ class GPTModel(DeepEvalBaseLLM):
 
         if check_if_multimodal(prompt):
             prompt = convert_to_multi_modal_array(input=prompt)
-            prompt = self.generate_prompt(prompt)
+            content = self.generate_content(prompt)
+        else:
+            content = [{"type": "text", "text": prompt}]
 
         if schema:
-            if self.name in structured_outputs_models:
+            if self.supports_structured_outputs() is True:
                 completion = await client.beta.chat.completions.parse(
                     model=self.name,
                     messages=[
-                        {"role": "user", "content": prompt},
+                        {"role": "user", "content": content},
                     ],
                     response_format=schema,
                     temperature=self.temperature,
@@ -420,11 +236,11 @@ class GPTModel(DeepEvalBaseLLM):
                     completion.usage.completion_tokens,
                 )
                 return structured_output, cost
-            if self.name in json_mode_models:
+            if self.supports_json_mode() is True:
                 completion = await client.beta.chat.completions.parse(
                     model=self.name,
                     messages=[
-                        {"role": "user", "content": prompt},
+                        {"role": "user", "content": content},
                     ],
                     response_format={"type": "json_object"},
                     temperature=self.temperature,
@@ -441,7 +257,7 @@ class GPTModel(DeepEvalBaseLLM):
 
         completion = await client.chat.completions.create(
             model=self.name,
-            messages=[{"role": "user", "content": prompt}],
+            messages=[{"role": "user", "content": content}],
             temperature=self.temperature,
             **self.generation_kwargs,
         )
@@ -455,9 +271,9 @@ class GPTModel(DeepEvalBaseLLM):
         else:
             return output, cost
 
-    ###############################################
-    # Other generate functions
-    ###############################################
+    ############################
+    # Other generate functions #
+    ############################
 
     @retry_openai
     def generate_raw_response(
@@ -466,13 +282,26 @@ class GPTModel(DeepEvalBaseLLM):
         top_logprobs: int = 5,
     ) -> Tuple[ChatCompletion, float]:
         # Generate completion
+        model_name = self.name
+        is_multimodal = check_if_multimodal(prompt)
+
+        # validate that this model supports logprobs
+        if self.supports_log_probs() is False:
+            raise DeepEvalError(
+                f"Model `{model_name}` does not support `logprobs` / `top_logprobs`. "
+                "Please use a different OpenAI model (for example `gpt-4.1` or `gpt-4o`) "
+                "when calling `generate_raw_response`."
+            )
+
         client = self.load_model(async_mode=False)
-        if check_if_multimodal(prompt):
+        if is_multimodal:
             prompt = convert_to_multi_modal_array(input=prompt)
-            prompt = self.generate_prompt(prompt)
+            content = self.generate_content(prompt)
+        else:
+            content = [{"type": "text", "text": prompt}]
         completion = client.chat.completions.create(
             model=self.name,
-            messages=[{"role": "user", "content": prompt}],
+            messages=[{"role": "user", "content": content}],
             temperature=self.temperature,
             logprobs=True,
             top_logprobs=top_logprobs,
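
`generate_raw_response` now fails fast when `model_data` marks `supports_log_probs` as `False`, instead of letting OpenAI reject the request mid-call. A caller-side sketch of the new behavior, again reusing `judge`:

    from deepeval.errors import DeepEvalError

    try:
        completion, cost = judge.generate_raw_response("Hello", top_logprobs=5)
    except DeepEvalError:
        # raised up front (e.g. for o1/gpt-5 family models whose model_data
        # marks supports_log_probs as False) rather than as an API error
        ...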
@@ -492,13 +321,26 @@ class GPTModel(DeepEvalBaseLLM):
         top_logprobs: int = 5,
     ) -> Tuple[ChatCompletion, float]:
         # Generate completion
+        model_name = self.name
+        is_multimodal = check_if_multimodal(prompt)
+
+        # validate that this model supports logprobs
+        if self.supports_log_probs() is False:
+            raise DeepEvalError(
+                f"Model `{model_name}` does not support `logprobs` / `top_logprobs`. "
+                "Please use a different OpenAI model (for example `gpt-4.1` or `gpt-4o`) "
+                "when calling `a_generate_raw_response`."
+            )
+
         client = self.load_model(async_mode=True)
-        if check_if_multimodal(prompt):
+        if is_multimodal:
             prompt = convert_to_multi_modal_array(input=prompt)
-            prompt = self.generate_prompt(prompt)
+            content = self.generate_content(prompt)
+        else:
+            content = [{"type": "text", "text": prompt}]
         completion = await client.chat.completions.create(
             model=self.name,
-            messages=[{"role": "user", "content": prompt}],
+            messages=[{"role": "user", "content": content}],
             temperature=self.temperature,
             logprobs=True,
             top_logprobs=top_logprobs,
@@ -514,14 +356,16 @@ class GPTModel(DeepEvalBaseLLM):
     @retry_openai
     def generate_samples(
         self, prompt: str, n: int, temperature: float
-    ) -> Tuple[list[str], float]:
+    ) -> list[str]:
         client = self.load_model(async_mode=False)
         if check_if_multimodal(prompt):
             prompt = convert_to_multi_modal_array(input=prompt)
-            prompt = self.generate_prompt(prompt)
+            content = self.generate_content(prompt)
+        else:
+            content = [{"type": "text", "text": prompt}]
         response = client.chat.completions.create(
             model=self.name,
-            messages=[{"role": "user", "content": prompt}],
+            messages=[{"role": "user", "content": content}],
             n=n,
             temperature=temperature,
             **self.generation_kwargs,
@@ -529,55 +373,74 @@ class GPTModel(DeepEvalBaseLLM):
         completions = [choice.message.content for choice in response.choices]
         return completions
 
-    ###############################################
-    # Utilities
-    ###############################################
+    #############
+    # Utilities #
+    #############
 
     def calculate_cost(self, input_tokens: int, output_tokens: int) -> float:
-        # TODO: consider loggin a warning instead of defaulting to whole model pricing
-        pricing = model_pricing.get(self.name, model_pricing)
-        input_cost = input_tokens * pricing["input"]
-        output_cost = output_tokens * pricing["output"]
-        return input_cost + output_cost
+        if self.model_data.input_price and self.model_data.output_price:
+            input_cost = input_tokens * self.model_data.input_price
+            output_cost = output_tokens * self.model_data.output_price
+            return input_cost + output_cost
+
+    #########################
+    #     Capabilities      #
+    #########################
+
+    def supports_log_probs(self) -> Union[bool, None]:
+        return self.model_data.supports_log_probs
+
+    def supports_temperature(self) -> Union[bool, None]:
+        return self.model_data.supports_temperature
+
+    def supports_multimodal(self) -> Union[bool, None]:
+        return self.model_data.supports_multimodal
+
+    def supports_structured_outputs(self) -> Union[bool, None]:
+        """
+        OpenAI models that natively enforce typed structured outputs.
+        Used by generate(...) when a schema is provided.
+        """
+        return self.model_data.supports_structured_outputs
+
+    def supports_json_mode(self) -> Union[bool, None]:
+        """
+        OpenAI models that enforce JSON mode
+        """
+        return self.model_data.supports_json
 
     #########
     # Model #
     #########
 
-    def generate_prompt(
-        self, multimodal_input: List[Union[str, MLLMImage]] = []
+    def generate_content(
+        self, multimodal_input: Optional[List[Union[str, MLLMImage]]] = None
     ):
-        prompt = []
-        for ele in multimodal_input:
-            if isinstance(ele, str):
-                prompt.append({"type": "text", "text": ele})
-            elif isinstance(ele, MLLMImage):
-                if ele.local:
-                    import PIL.Image
-
-                    image = PIL.Image.open(ele.url)
-                    visual_dict = {
-                        "type": "image_url",
-                        "image_url": {
-                            "url": f"data:image/jpeg;base64,{self.encode_pil_image(image)}"
-                        },
-                    }
+        multimodal_input = [] if multimodal_input is None else multimodal_input
+        content = []
+        for element in multimodal_input:
+            if isinstance(element, str):
+                content.append({"type": "text", "text": element})
+            elif isinstance(element, MLLMImage):
+                if element.url and not element.local:
+                    content.append(
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": element.url},
+                        }
+                    )
                 else:
-                    visual_dict = {
-                        "type": "image_url",
-                        "image_url": {"url": ele.url},
-                    }
-                prompt.append(visual_dict)
-        return prompt
-
-    def encode_pil_image(self, pil_image):
-        image_buffer = BytesIO()
-        if pil_image.mode in ("RGBA", "LA", "P"):
-            pil_image = pil_image.convert("RGB")
-        pil_image.save(image_buffer, format="JPEG")
-        image_bytes = image_buffer.getvalue()
-        base64_encoded_image = base64.b64encode(image_bytes).decode("utf-8")
-        return base64_encoded_image
+                    element.ensure_images_loaded()
+                    data_uri = (
+                        f"data:{element.mimeType};base64,{element.dataBase64}"
+                    )
+                    content.append(
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": data_uri},
+                        }
+                    )
+        return content
 
     def load_model(self, async_mode: bool = False):
         if not async_mode:
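
`calculate_cost` now reads prices off `model_data` and, as written, implicitly returns `None` when either price is unset; the old fallback (`model_pricing.get(self.name, model_pricing)`) defaulted to the entire pricing table, which the removed TODO comment already flagged. A worked example with the gpt-4.1 prices from the removed table, reusing `judge`:

    # 1,000 input tokens and 500 output tokens on gpt-4.1:
    #   1_000 * (2.00 / 1e6) + 500 * (8.00 / 1e6) = 0.002 + 0.004 = 0.006 USD
    cost = judge.calculate_cost(input_tokens=1_000, output_tokens=500)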
@@ -620,10 +483,5 @@ class GPTModel(DeepEvalBaseLLM):
             return cls(**kw)
         raise
 
-    def supports_multimodal(self):
-        if self.name in GPTModel.valid_multimodal_models:
-            return True
-        return False
-
     def get_model_name(self):
         return f"{self.name}"