deepeval 3.7.4__py3-none-any.whl → 3.7.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (224)
  1. deepeval/_version.py +1 -1
  2. deepeval/config/settings.py +35 -1
  3. deepeval/dataset/api.py +23 -1
  4. deepeval/dataset/golden.py +139 -2
  5. deepeval/evaluate/evaluate.py +16 -11
  6. deepeval/evaluate/execute.py +13 -181
  7. deepeval/evaluate/utils.py +6 -26
  8. deepeval/integrations/pydantic_ai/agent.py +19 -2
  9. deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
  10. deepeval/key_handler.py +3 -0
  11. deepeval/metrics/__init__.py +14 -16
  12. deepeval/metrics/answer_relevancy/answer_relevancy.py +118 -116
  13. deepeval/metrics/answer_relevancy/template.py +22 -3
  14. deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
  15. deepeval/metrics/arena_g_eval/template.py +17 -1
  16. deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
  17. deepeval/metrics/argument_correctness/template.py +19 -2
  18. deepeval/metrics/base_metric.py +13 -44
  19. deepeval/metrics/bias/bias.py +102 -108
  20. deepeval/metrics/bias/template.py +14 -2
  21. deepeval/metrics/contextual_precision/contextual_precision.py +96 -94
  22. deepeval/metrics/contextual_precision/template.py +115 -66
  23. deepeval/metrics/contextual_recall/contextual_recall.py +94 -84
  24. deepeval/metrics/contextual_recall/template.py +106 -55
  25. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +86 -84
  26. deepeval/metrics/contextual_relevancy/template.py +87 -58
  27. deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
  28. deepeval/metrics/conversation_completeness/template.py +23 -3
  29. deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
  30. deepeval/metrics/conversational_dag/nodes.py +66 -123
  31. deepeval/metrics/conversational_dag/templates.py +16 -0
  32. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
  33. deepeval/metrics/dag/dag.py +10 -0
  34. deepeval/metrics/dag/nodes.py +63 -126
  35. deepeval/metrics/dag/templates.py +16 -2
  36. deepeval/metrics/exact_match/exact_match.py +9 -1
  37. deepeval/metrics/faithfulness/faithfulness.py +138 -149
  38. deepeval/metrics/faithfulness/schema.py +1 -1
  39. deepeval/metrics/faithfulness/template.py +200 -115
  40. deepeval/metrics/g_eval/g_eval.py +87 -78
  41. deepeval/metrics/g_eval/template.py +18 -1
  42. deepeval/metrics/g_eval/utils.py +7 -6
  43. deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
  44. deepeval/metrics/goal_accuracy/template.py +21 -3
  45. deepeval/metrics/hallucination/hallucination.py +60 -75
  46. deepeval/metrics/hallucination/template.py +13 -0
  47. deepeval/metrics/indicator.py +7 -10
  48. deepeval/metrics/json_correctness/json_correctness.py +40 -38
  49. deepeval/metrics/json_correctness/template.py +10 -0
  50. deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
  51. deepeval/metrics/knowledge_retention/schema.py +9 -3
  52. deepeval/metrics/knowledge_retention/template.py +12 -0
  53. deepeval/metrics/mcp/mcp_task_completion.py +68 -38
  54. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
  55. deepeval/metrics/mcp/template.py +52 -0
  56. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
  57. deepeval/metrics/mcp_use_metric/template.py +12 -0
  58. deepeval/metrics/misuse/misuse.py +77 -97
  59. deepeval/metrics/misuse/template.py +15 -0
  60. deepeval/metrics/multimodal_metrics/__init__.py +0 -19
  61. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +59 -53
  62. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +79 -95
  63. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +59 -53
  64. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +59 -53
  65. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +111 -109
  66. deepeval/metrics/non_advice/non_advice.py +79 -105
  67. deepeval/metrics/non_advice/template.py +12 -0
  68. deepeval/metrics/pattern_match/pattern_match.py +12 -4
  69. deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
  70. deepeval/metrics/pii_leakage/template.py +14 -0
  71. deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
  72. deepeval/metrics/plan_adherence/template.py +11 -0
  73. deepeval/metrics/plan_quality/plan_quality.py +63 -87
  74. deepeval/metrics/plan_quality/template.py +9 -0
  75. deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
  76. deepeval/metrics/prompt_alignment/template.py +12 -0
  77. deepeval/metrics/ragas.py +3 -3
  78. deepeval/metrics/role_adherence/role_adherence.py +48 -71
  79. deepeval/metrics/role_adherence/template.py +14 -0
  80. deepeval/metrics/role_violation/role_violation.py +75 -108
  81. deepeval/metrics/role_violation/template.py +12 -0
  82. deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
  83. deepeval/metrics/step_efficiency/template.py +11 -0
  84. deepeval/metrics/summarization/summarization.py +115 -183
  85. deepeval/metrics/summarization/template.py +19 -0
  86. deepeval/metrics/task_completion/task_completion.py +67 -73
  87. deepeval/metrics/tool_correctness/tool_correctness.py +45 -44
  88. deepeval/metrics/tool_use/tool_use.py +42 -66
  89. deepeval/metrics/topic_adherence/template.py +13 -0
  90. deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
  91. deepeval/metrics/toxicity/template.py +13 -0
  92. deepeval/metrics/toxicity/toxicity.py +80 -99
  93. deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  94. deepeval/metrics/turn_contextual_precision/template.py +187 -0
  95. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +592 -0
  96. deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  97. deepeval/metrics/turn_contextual_recall/template.py +178 -0
  98. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +563 -0
  99. deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
  100. deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
  101. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +576 -0
  102. deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
  103. deepeval/metrics/turn_faithfulness/template.py +218 -0
  104. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +627 -0
  105. deepeval/metrics/turn_relevancy/template.py +14 -0
  106. deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
  107. deepeval/metrics/utils.py +158 -122
  108. deepeval/models/__init__.py +0 -12
  109. deepeval/models/base_model.py +49 -33
  110. deepeval/models/embedding_models/__init__.py +7 -0
  111. deepeval/models/embedding_models/azure_embedding_model.py +79 -33
  112. deepeval/models/embedding_models/local_embedding_model.py +39 -20
  113. deepeval/models/embedding_models/ollama_embedding_model.py +52 -19
  114. deepeval/models/embedding_models/openai_embedding_model.py +42 -22
  115. deepeval/models/llms/amazon_bedrock_model.py +226 -72
  116. deepeval/models/llms/anthropic_model.py +178 -63
  117. deepeval/models/llms/azure_model.py +218 -60
  118. deepeval/models/llms/constants.py +2032 -0
  119. deepeval/models/llms/deepseek_model.py +95 -40
  120. deepeval/models/llms/gemini_model.py +209 -64
  121. deepeval/models/llms/grok_model.py +139 -68
  122. deepeval/models/llms/kimi_model.py +140 -90
  123. deepeval/models/llms/litellm_model.py +131 -37
  124. deepeval/models/llms/local_model.py +125 -21
  125. deepeval/models/llms/ollama_model.py +147 -24
  126. deepeval/models/llms/openai_model.py +222 -269
  127. deepeval/models/llms/portkey_model.py +81 -22
  128. deepeval/models/llms/utils.py +8 -3
  129. deepeval/models/retry_policy.py +17 -14
  130. deepeval/models/utils.py +106 -5
  131. deepeval/optimizer/__init__.py +5 -0
  132. deepeval/optimizer/algorithms/__init__.py +6 -0
  133. deepeval/optimizer/algorithms/base.py +29 -0
  134. deepeval/optimizer/algorithms/configs.py +18 -0
  135. deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  136. deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
  137. deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  138. deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
  139. deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  140. deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  141. deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  142. deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  143. deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  144. deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
  145. deepeval/{optimization → optimizer}/configs.py +5 -8
  146. deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
  147. deepeval/optimizer/prompt_optimizer.py +263 -0
  148. deepeval/optimizer/rewriter/__init__.py +5 -0
  149. deepeval/optimizer/rewriter/rewriter.py +124 -0
  150. deepeval/optimizer/rewriter/utils.py +214 -0
  151. deepeval/optimizer/scorer/__init__.py +5 -0
  152. deepeval/optimizer/scorer/base.py +86 -0
  153. deepeval/optimizer/scorer/scorer.py +316 -0
  154. deepeval/optimizer/scorer/utils.py +30 -0
  155. deepeval/optimizer/types.py +148 -0
  156. deepeval/{optimization → optimizer}/utils.py +47 -165
  157. deepeval/prompt/prompt.py +5 -9
  158. deepeval/simulator/conversation_simulator.py +43 -0
  159. deepeval/simulator/template.py +13 -0
  160. deepeval/test_case/__init__.py +1 -3
  161. deepeval/test_case/api.py +26 -45
  162. deepeval/test_case/arena_test_case.py +7 -2
  163. deepeval/test_case/conversational_test_case.py +68 -1
  164. deepeval/test_case/llm_test_case.py +206 -1
  165. deepeval/test_case/utils.py +4 -8
  166. deepeval/test_run/api.py +18 -14
  167. deepeval/test_run/test_run.py +3 -3
  168. deepeval/tracing/patchers.py +9 -4
  169. deepeval/tracing/tracing.py +2 -2
  170. deepeval/utils.py +65 -0
  171. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -4
  172. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/RECORD +180 -193
  173. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
  174. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
  175. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
  176. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
  177. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
  178. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
  179. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
  180. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
  181. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
  182. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
  183. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
  184. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  185. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
  186. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
  187. deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
  188. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
  189. deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
  190. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -148
  191. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
  192. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  193. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
  194. deepeval/models/mlllms/__init__.py +0 -4
  195. deepeval/models/mlllms/azure_model.py +0 -343
  196. deepeval/models/mlllms/gemini_model.py +0 -313
  197. deepeval/models/mlllms/ollama_model.py +0 -175
  198. deepeval/models/mlllms/openai_model.py +0 -309
  199. deepeval/optimization/__init__.py +0 -13
  200. deepeval/optimization/adapters/__init__.py +0 -2
  201. deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
  202. deepeval/optimization/aggregates.py +0 -14
  203. deepeval/optimization/copro/configs.py +0 -31
  204. deepeval/optimization/gepa/__init__.py +0 -7
  205. deepeval/optimization/gepa/configs.py +0 -115
  206. deepeval/optimization/miprov2/configs.py +0 -134
  207. deepeval/optimization/miprov2/loop.py +0 -785
  208. deepeval/optimization/mutations/__init__.py +0 -0
  209. deepeval/optimization/mutations/prompt_rewriter.py +0 -458
  210. deepeval/optimization/policies/__init__.py +0 -16
  211. deepeval/optimization/policies/tie_breaker.py +0 -67
  212. deepeval/optimization/prompt_optimizer.py +0 -462
  213. deepeval/optimization/simba/__init__.py +0 -0
  214. deepeval/optimization/simba/configs.py +0 -33
  215. deepeval/optimization/types.py +0 -361
  216. deepeval/test_case/mllm_test_case.py +0 -170
  217. /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
  218. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
  219. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
  220. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
  221. /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
  222. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
  223. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
  224. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
@@ -1,234 +1,77 @@
1
1
  from openai.types.chat.chat_completion import ChatCompletion
2
- from typing import Optional, Tuple, Union, Dict
2
+ from typing import Optional, Tuple, Union, Dict, List
3
+ from deepeval.test_case import MLLMImage
3
4
  from pydantic import BaseModel, SecretStr
4
-
5
5
  from openai import (
6
6
  OpenAI,
7
7
  AsyncOpenAI,
8
8
  )
9
9
 
10
+ from deepeval.errors import DeepEvalError
11
+ from deepeval.utils import check_if_multimodal, convert_to_multi_modal_array
10
12
  from deepeval.config.settings import get_settings
11
13
  from deepeval.constants import ProviderSlug as PS
12
14
  from deepeval.models import DeepEvalBaseLLM
13
15
  from deepeval.models.llms.utils import trim_and_load_json
14
- from deepeval.models.utils import parse_model_name, require_secret_api_key
16
+ from deepeval.models.utils import (
17
+ parse_model_name,
18
+ require_costs,
19
+ require_secret_api_key,
20
+ normalize_kwargs_and_extract_aliases,
21
+ )
15
22
  from deepeval.models.retry_policy import (
16
23
  create_retry_decorator,
17
24
  sdk_retries_for,
18
25
  )
26
+ from deepeval.models.llms.constants import (
27
+ OPENAI_MODELS_DATA,
28
+ )
19
29
 
20
30
 
21
31
  retry_openai = create_retry_decorator(PS.OPENAI)
22
32
 
23
- valid_gpt_models = [
24
- "gpt-3.5-turbo",
25
- "gpt-3.5-turbo-0125",
26
- "gpt-3.5-turbo-1106",
27
- "gpt-4-0125-preview",
28
- "gpt-4-1106-preview",
29
- "gpt-4-turbo",
30
- "gpt-4-turbo-2024-04-09",
31
- "gpt-4-turbo-preview",
32
- "gpt-4o",
33
- "gpt-4o-2024-05-13",
34
- "gpt-4o-2024-08-06",
35
- "gpt-4o-2024-11-20",
36
- "gpt-4o-mini",
37
- "gpt-4o-mini-2024-07-18",
38
- "gpt-4-32k",
39
- "gpt-4-32k-0613",
40
- "gpt-4.1",
41
- "gpt-4.1-mini",
42
- "gpt-4.1-nano",
43
- "gpt-4.5-preview",
44
- "o1",
45
- "o1-preview",
46
- "o1-2024-12-17",
47
- "o1-preview-2024-09-12",
48
- "o1-mini",
49
- "o1-mini-2024-09-12",
50
- "o3-mini",
51
- "o3-mini-2025-01-31",
52
- "o4-mini",
53
- "o4-mini-2025-04-16",
54
- "gpt-4.5-preview-2025-02-27",
55
- "gpt-5",
56
- "gpt-5-2025-08-07",
57
- "gpt-5-mini",
58
- "gpt-5-mini-2025-08-07",
59
- "gpt-5-nano",
60
- "gpt-5-nano-2025-08-07",
61
- "gpt-5-chat-latest",
62
- ]
63
-
64
- unsupported_log_probs_gpt_models = [
65
- "o1",
66
- "o1-preview",
67
- "o1-2024-12-17",
68
- "o1-preview-2024-09-12",
69
- "o1-mini",
70
- "o1-mini-2024-09-12",
71
- "o3-mini",
72
- "o3-mini-2025-01-31",
73
- "o4-mini",
74
- "o4-mini-2025-04-16",
75
- "gpt-4.5-preview-2025-02-27",
76
- "gpt-5",
77
- "gpt-5-2025-08-07",
78
- "gpt-5-mini",
79
- "gpt-5-mini-2025-08-07",
80
- "gpt-5-nano",
81
- "gpt-5-nano-2025-08-07",
82
- "gpt-5-chat-latest",
83
- ]
84
-
85
- structured_outputs_models = [
86
- "gpt-4o",
87
- "gpt-4o-2024-05-13",
88
- "gpt-4o-2024-08-06",
89
- "gpt-4o-2024-11-20",
90
- "gpt-4o-mini",
91
- "gpt-4o-mini-2024-07-18",
92
- "gpt-4.1",
93
- "gpt-4.1-mini",
94
- "gpt-4.1-nano",
95
- "o1",
96
- "o1-preview",
97
- "o1-2024-12-17",
98
- "o3-mini",
99
- "o3-mini-2025-01-31",
100
- "o4-mini",
101
- "o4-mini-2025-04-16",
102
- "gpt-4.5-preview-2025-02-27",
103
- "gpt-5",
104
- "gpt-5-2025-08-07",
105
- "gpt-5-mini",
106
- "gpt-5-mini-2025-08-07",
107
- "gpt-5-nano",
108
- "gpt-5-nano-2025-08-07",
109
- ]
110
-
111
- json_mode_models = [
112
- "gpt-3.5-turbo",
113
- "gpt-3.5-turbo-0125",
114
- "gpt-3.5-turbo-1106",
115
- "gpt-4-0125-preview",
116
- "gpt-4-1106-preview",
117
- "gpt-4-turbo",
118
- "gpt-4-turbo-2024-04-09",
119
- "gpt-4-turbo-preview",
120
- "gpt-4-32k",
121
- "gpt-4-32k-0613",
122
- ]
123
-
124
- model_pricing = {
125
- "gpt-4o-mini": {"input": 0.150 / 1e6, "output": 0.600 / 1e6},
126
- "gpt-4o": {"input": 2.50 / 1e6, "output": 10.00 / 1e6},
127
- "gpt-4-turbo": {"input": 10.00 / 1e6, "output": 30.00 / 1e6},
128
- "gpt-4-turbo-preview": {"input": 10.00 / 1e6, "output": 30.00 / 1e6},
129
- "gpt-4-0125-preview": {"input": 10.00 / 1e6, "output": 30.00 / 1e6},
130
- "gpt-4-1106-preview": {"input": 10.00 / 1e6, "output": 30.00 / 1e6},
131
- "gpt-4": {"input": 30.00 / 1e6, "output": 60.00 / 1e6},
132
- "gpt-4-32k": {"input": 60.00 / 1e6, "output": 120.00 / 1e6},
133
- "gpt-3.5-turbo-1106": {"input": 1.00 / 1e6, "output": 2.00 / 1e6},
134
- "gpt-3.5-turbo": {"input": 0.50 / 1e6, "output": 1.50 / 1e6},
135
- "gpt-3.5-turbo-16k": {"input": 3.00 / 1e6, "output": 4.00 / 1e6},
136
- "gpt-3.5-turbo-0125": {"input": 0.50 / 1e6, "output": 1.50 / 1e6},
137
- "gpt-3.5-turbo-instruct": {"input": 1.50 / 1e6, "output": 2.00 / 1e6},
138
- "o1": {"input": 15.00 / 1e6, "output": 60.00 / 1e6},
139
- "o1-preview": {"input": 15.00 / 1e6, "output": 60.00 / 1e6},
140
- "o1-2024-12-17": {"input": 15.00 / 1e6, "output": 60.00 / 1e6},
141
- "o3-mini": {"input": 1.10 / 1e6, "output": 4.40 / 1e6},
142
- "o3-mini-2025-01-31": {"input": 1.10 / 1e6, "output": 4.40 / 1e6},
143
- "o4-mini": {"input": 1.10 / 1e6, "output": 4.40 / 1e6},
144
- "o4-mini-2025-04-16": {"input": 1.10 / 1e6, "output": 4.40 / 1e6},
145
- "gpt-4.1": {
146
- "input": 2.00 / 1e6,
147
- "output": 8.00 / 1e6,
148
- },
149
- "gpt-4.1-mini": {
150
- "input": 0.4 / 1e6,
151
- "output": 1.60 / 1e6,
152
- },
153
- "gpt-4.1-nano": {
154
- "input": 0.1 / 1e6,
155
- "output": 0.4 / 1e6,
156
- },
157
- "gpt-4.5-preview": {
158
- "input": 75.00 / 1e6,
159
- "output": 150.00 / 1e6,
160
- },
161
- "gpt-5": {
162
- "input": 1.25 / 1e6,
163
- "output": 10.00 / 1e6,
164
- },
165
- "gpt-5-2025-08-07": {
166
- "input": 1.25 / 1e6,
167
- "output": 10.00 / 1e6,
168
- },
169
- "gpt-5-mini": {
170
- "input": 0.25 / 1e6,
171
- "output": 2.00 / 1e6,
172
- },
173
- "gpt-5-mini-2025-08-07": {
174
- "input": 0.25 / 1e6,
175
- "output": 2.00 / 1e6,
176
- },
177
- "gpt-5-nano": {
178
- "input": 0.05 / 1e6,
179
- "output": 0.40 / 1e6,
180
- },
181
- "gpt-5-nano-2025-08-07": {
182
- "input": 0.05 / 1e6,
183
- "output": 0.40 / 1e6,
184
- },
185
- "gpt-5-chat-latest": {
186
- "input": 1.25 / 1e6,
187
- "output": 10.00 / 1e6,
188
- },
189
- }
190
-
191
33
  default_gpt_model = "gpt-4.1"
192
34
 
193
- # Thinking models that require temperature=1
194
- models_requiring_temperature_1 = [
195
- "o1",
196
- "o1-2024-12-17",
197
- "o1-mini",
198
- "o1-mini-2024-09-12",
199
- "o3-mini",
200
- "o3-mini-2025-01-31",
201
- "o4-mini",
202
- "o4-mini-2025-04-16",
203
- "gpt-5",
204
- "gpt-5-2025-08-07",
205
- "gpt-5-mini",
206
- "gpt-5-mini-2025-08-07",
207
- "gpt-5-nano",
208
- "gpt-5-nano-2025-08-07",
209
- ]
210
-
211
35
 
212
36
  def _request_timeout_seconds() -> float:
213
37
  timeout = float(get_settings().DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS or 0)
214
38
  return timeout if timeout > 0 else 30.0
215
39
 
216
40
 
41
+ _ALIAS_MAP = {
42
+ "api_key": ["_openai_api_key"],
43
+ }
44
+
45
+
217
46
  class GPTModel(DeepEvalBaseLLM):
47
+
218
48
  def __init__(
219
49
  self,
220
50
  model: Optional[str] = None,
221
- _openai_api_key: Optional[str] = None,
51
+ api_key: Optional[str] = None,
222
52
  base_url: Optional[str] = None,
53
+ temperature: Optional[float] = None,
223
54
  cost_per_input_token: Optional[float] = None,
224
55
  cost_per_output_token: Optional[float] = None,
225
- temperature: float = 0,
226
56
  generation_kwargs: Optional[Dict] = None,
227
57
  **kwargs,
228
58
  ):
229
59
  settings = get_settings()
230
- model_name = None
60
+
61
+ normalized_kwargs, alias_values = normalize_kwargs_and_extract_aliases(
62
+ "GPTModel",
63
+ kwargs,
64
+ _ALIAS_MAP,
65
+ )
66
+
67
+ # re-map depricated keywords to re-named positional args
68
+ if api_key is None and "api_key" in alias_values:
69
+ api_key = alias_values["api_key"]
70
+
231
71
  model = model or settings.OPENAI_MODEL_NAME
72
+ if model is None:
73
+ model = default_gpt_model
74
+
232
75
  cost_per_input_token = (
233
76
  cost_per_input_token
234
77
  if cost_per_input_token is not None
@@ -240,67 +83,80 @@ class GPTModel(DeepEvalBaseLLM):
240
83
  else settings.OPENAI_COST_PER_OUTPUT_TOKEN
241
84
  )
242
85
 
243
- if isinstance(model, str):
244
- model_name = parse_model_name(model)
245
- if model_name not in valid_gpt_models:
246
- raise ValueError(
247
- f"Invalid model. Available GPT models: {', '.join(model for model in valid_gpt_models)}"
248
- )
249
- elif model is None:
250
- model_name = default_gpt_model
251
-
252
- if model_name not in model_pricing:
253
- if cost_per_input_token is None or cost_per_output_token is None:
254
- raise ValueError(
255
- f"No pricing available for `{model_name}`. "
256
- "Please provide both `cost_per_input_token` and `cost_per_output_token` when initializing `GPTModel`, "
257
- "or set them via the CLI:\n"
258
- " deepeval set-openai --model=[...] --cost_per_input_token=[...] --cost_per_output_token=[...]"
259
- )
260
- else:
261
- model_pricing[model_name] = {
262
- "input": float(cost_per_input_token),
263
- "output": float(cost_per_output_token),
264
- }
265
-
266
- elif model is None:
267
- model_name = default_gpt_model
268
-
269
- if _openai_api_key is not None:
86
+ if api_key is not None:
270
87
  # keep it secret, keep it safe from serializings, logging and alike
271
- self._openai_api_key: SecretStr | None = SecretStr(_openai_api_key)
88
+ self.api_key: Optional[SecretStr] = SecretStr(api_key)
272
89
  else:
273
- self._openai_api_key = get_settings().OPENAI_API_KEY
90
+ self.api_key = settings.OPENAI_API_KEY
274
91
 
275
- self.base_url = base_url
92
+ self.base_url = (
93
+ str(base_url).rstrip("/") if base_url is not None else None
94
+ )
276
95
  # args and kwargs will be passed to the underlying model, in load_model function
277
96
 
278
- # Auto-adjust temperature for models that require it
279
- if model_name in models_requiring_temperature_1:
97
+ if temperature is not None:
98
+ temperature = float(temperature)
99
+ elif settings.TEMPERATURE is not None:
100
+ temperature = settings.TEMPERATURE
101
+ else:
102
+ temperature = 0.0
103
+
104
+ if isinstance(model, str):
105
+ model = parse_model_name(model)
106
+
107
+ self.model_data = OPENAI_MODELS_DATA.get(model)
108
+
109
+ # Auto-adjust temperature for known models that require it
110
+ if self.model_data.supports_temperature is False:
280
111
  temperature = 1
281
112
 
113
+ # validation
114
+ cost_per_input_token, cost_per_output_token = require_costs(
115
+ self.model_data,
116
+ model,
117
+ "OPENAI_COST_PER_INPUT_TOKEN",
118
+ "OPENAI_COST_PER_OUTPUT_TOKEN",
119
+ cost_per_input_token,
120
+ cost_per_output_token,
121
+ )
122
+ self.model_data.input_price = cost_per_input_token
123
+ self.model_data.output_price = cost_per_output_token
124
+
282
125
  if temperature < 0:
283
- raise ValueError("Temperature must be >= 0.")
126
+ raise DeepEvalError("Temperature must be >= 0.")
127
+
284
128
  self.temperature = temperature
285
- self.kwargs = kwargs
286
- self.generation_kwargs = generation_kwargs or {}
287
- super().__init__(model_name)
129
+ # Keep sanitized kwargs for client call to strip legacy keys
130
+ self.kwargs = normalized_kwargs
131
+ self.kwargs.pop("temperature", None)
132
+
133
+ self.generation_kwargs = dict(generation_kwargs or {})
134
+ self.generation_kwargs.pop("temperature", None)
288
135
 
289
- ###############################################
290
- # Generate functions
291
- ###############################################
136
+ super().__init__(model)
137
+
138
+ ######################
139
+ # Generate functions #
140
+ ######################
292
141
 
293
142
  @retry_openai
294
143
  def generate(
295
144
  self, prompt: str, schema: Optional[BaseModel] = None
296
- ) -> Tuple[Union[str, Dict], float]:
145
+ ) -> Tuple[Union[str, BaseModel], float]:
297
146
  client = self.load_model(async_mode=False)
147
+
148
+ if check_if_multimodal(prompt):
149
+ prompt = convert_to_multi_modal_array(input=prompt)
150
+ content = self.generate_content(prompt)
151
+ else:
152
+ content = [{"type": "text", "text": prompt}]
153
+
298
154
  if schema:
299
- if self.model_name in structured_outputs_models:
155
+ if self.supports_structured_outputs() is True:
300
156
  completion = client.beta.chat.completions.parse(
301
- model=self.model_name,
157
+ model=self.name,
302
158
  messages=[
303
- {"role": "user", "content": prompt},
159
+ {"role": "user", "content": content},
304
160
  ],
305
161
  response_format=schema,
306
162
  temperature=self.temperature,
@@ -314,11 +170,11 @@ class GPTModel(DeepEvalBaseLLM):
314
170
  completion.usage.completion_tokens,
315
171
  )
316
172
  return structured_output, cost
317
- if self.model_name in json_mode_models:
173
+ if self.supports_json_mode() is True:
318
174
  completion = client.beta.chat.completions.parse(
319
- model=self.model_name,
175
+ model=self.name,
320
176
  messages=[
321
- {"role": "user", "content": prompt},
177
+ {"role": "user", "content": content},
322
178
  ],
323
179
  response_format={"type": "json_object"},
324
180
  temperature=self.temperature,
@@ -334,8 +190,8 @@ class GPTModel(DeepEvalBaseLLM):
334
190
  return schema.model_validate(json_output), cost
335
191
 
336
192
  completion = client.chat.completions.create(
337
- model=self.model_name,
338
- messages=[{"role": "user", "content": prompt}],
193
+ model=self.name,
194
+ messages=[{"role": "user", "content": content}],
339
195
  temperature=self.temperature,
340
196
  **self.generation_kwargs,
341
197
  )
@@ -354,12 +210,19 @@ class GPTModel(DeepEvalBaseLLM):
354
210
  self, prompt: str, schema: Optional[BaseModel] = None
355
211
  ) -> Tuple[Union[str, BaseModel], float]:
356
212
  client = self.load_model(async_mode=True)
213
+
214
+ if check_if_multimodal(prompt):
215
+ prompt = convert_to_multi_modal_array(input=prompt)
216
+ content = self.generate_content(prompt)
217
+ else:
218
+ content = [{"type": "text", "text": prompt}]
219
+
357
220
  if schema:
358
- if self.model_name in structured_outputs_models:
221
+ if self.supports_structured_outputs() is True:
359
222
  completion = await client.beta.chat.completions.parse(
360
- model=self.model_name,
223
+ model=self.name,
361
224
  messages=[
362
- {"role": "user", "content": prompt},
225
+ {"role": "user", "content": content},
363
226
  ],
364
227
  response_format=schema,
365
228
  temperature=self.temperature,
@@ -373,11 +236,11 @@ class GPTModel(DeepEvalBaseLLM):
373
236
  completion.usage.completion_tokens,
374
237
  )
375
238
  return structured_output, cost
376
- if self.model_name in json_mode_models:
239
+ if self.supports_json_mode() is True:
377
240
  completion = await client.beta.chat.completions.parse(
378
- model=self.model_name,
241
+ model=self.name,
379
242
  messages=[
380
- {"role": "user", "content": prompt},
243
+ {"role": "user", "content": content},
381
244
  ],
382
245
  response_format={"type": "json_object"},
383
246
  temperature=self.temperature,
@@ -393,8 +256,8 @@ class GPTModel(DeepEvalBaseLLM):
393
256
  return schema.model_validate(json_output), cost
394
257
 
395
258
  completion = await client.chat.completions.create(
396
- model=self.model_name,
397
- messages=[{"role": "user", "content": prompt}],
259
+ model=self.name,
260
+ messages=[{"role": "user", "content": content}],
398
261
  temperature=self.temperature,
399
262
  **self.generation_kwargs,
400
263
  )
@@ -408,9 +271,9 @@ class GPTModel(DeepEvalBaseLLM):
408
271
  else:
409
272
  return output, cost
410
273
 
411
- ###############################################
412
- # Other generate functions
413
- ###############################################
274
+ ############################
275
+ # Other generate functions #
276
+ ############################
414
277
 
415
278
  @retry_openai
416
279
  def generate_raw_response(
@@ -419,10 +282,26 @@ class GPTModel(DeepEvalBaseLLM):
419
282
  top_logprobs: int = 5,
420
283
  ) -> Tuple[ChatCompletion, float]:
421
284
  # Generate completion
285
+ model_name = self.name
286
+ is_multimodal = check_if_multimodal(prompt)
287
+
288
+ # validate that this model supports logprobs
289
+ if self.supports_log_probs() is False:
290
+ raise DeepEvalError(
291
+ f"Model `{model_name}` does not support `logprobs` / `top_logprobs`. "
292
+ "Please use a different OpenAI model (for example `gpt-4.1` or `gpt-4o`) "
293
+ "when calling `generate_raw_response`."
294
+ )
295
+
422
296
  client = self.load_model(async_mode=False)
297
+ if is_multimodal:
298
+ prompt = convert_to_multi_modal_array(input=prompt)
299
+ content = self.generate_content(prompt)
300
+ else:
301
+ content = [{"type": "text", "text": prompt}]
423
302
  completion = client.chat.completions.create(
424
- model=self.model_name,
425
- messages=[{"role": "user", "content": prompt}],
303
+ model=self.name,
304
+ messages=[{"role": "user", "content": content}],
426
305
  temperature=self.temperature,
427
306
  logprobs=True,
428
307
  top_logprobs=top_logprobs,
@@ -442,10 +321,26 @@ class GPTModel(DeepEvalBaseLLM):
442
321
  top_logprobs: int = 5,
443
322
  ) -> Tuple[ChatCompletion, float]:
444
323
  # Generate completion
324
+ model_name = self.name
325
+ is_multimodal = check_if_multimodal(prompt)
326
+
327
+ # validate that this model supports logprobs
328
+ if self.supports_log_probs() is False:
329
+ raise DeepEvalError(
330
+ f"Model `{model_name}` does not support `logprobs` / `top_logprobs`. "
331
+ "Please use a different OpenAI model (for example `gpt-4.1` or `gpt-4o`) "
332
+ "when calling `a_generate_raw_response`."
333
+ )
334
+
445
335
  client = self.load_model(async_mode=True)
336
+ if is_multimodal:
337
+ prompt = convert_to_multi_modal_array(input=prompt)
338
+ content = self.generate_content(prompt)
339
+ else:
340
+ content = [{"type": "text", "text": prompt}]
446
341
  completion = await client.chat.completions.create(
447
- model=self.model_name,
448
- messages=[{"role": "user", "content": prompt}],
342
+ model=self.name,
343
+ messages=[{"role": "user", "content": content}],
449
344
  temperature=self.temperature,
450
345
  logprobs=True,
451
346
  top_logprobs=top_logprobs,
@@ -461,11 +356,16 @@ class GPTModel(DeepEvalBaseLLM):
461
356
  @retry_openai
462
357
  def generate_samples(
463
358
  self, prompt: str, n: int, temperature: float
464
- ) -> Tuple[list[str], float]:
359
+ ) -> list[str]:
465
360
  client = self.load_model(async_mode=False)
361
+ if check_if_multimodal(prompt):
362
+ prompt = convert_to_multi_modal_array(input=prompt)
363
+ content = self.generate_content(prompt)
364
+ else:
365
+ content = [{"type": "text", "text": prompt}]
466
366
  response = client.chat.completions.create(
467
- model=self.model_name,
468
- messages=[{"role": "user", "content": prompt}],
367
+ model=self.name,
368
+ messages=[{"role": "user", "content": content}],
469
369
  n=n,
470
370
  temperature=temperature,
471
371
  **self.generation_kwargs,
@@ -473,23 +373,73 @@ class GPTModel(DeepEvalBaseLLM):
473
373
  completions = [choice.message.content for choice in response.choices]
474
374
  return completions
475
375
 
476
- ###############################################
477
- # Utilities
478
- ###############################################
376
+ #############
377
+ # Utilities #
378
+ #############
479
379
 
480
380
  def calculate_cost(self, input_tokens: int, output_tokens: int) -> float:
481
- # TODO: consider loggin a warning instead of defaulting to whole model pricing
482
- pricing = model_pricing.get(self.model_name, model_pricing)
483
- input_cost = input_tokens * pricing["input"]
484
- output_cost = output_tokens * pricing["output"]
381
+ input_cost = input_tokens * self.model_data.input_price
382
+ output_cost = output_tokens * self.model_data.output_price
485
383
  return input_cost + output_cost
486
384
 
385
+ #########################
386
+ # Capabilities #
387
+ #########################
388
+
389
+ def supports_log_probs(self) -> Union[bool, None]:
390
+ return self.model_data.supports_log_probs
391
+
392
+ def supports_temperature(self) -> Union[bool, None]:
393
+ return self.model_data.supports_temperature
394
+
395
+ def supports_multimodal(self) -> Union[bool, None]:
396
+ return self.model_data.supports_multimodal
397
+
398
+ def supports_structured_outputs(self) -> Union[bool, None]:
399
+ """
400
+ OpenAI models that natively enforce typed structured outputs.
401
+ Used by generate(...) when a schema is provided.
402
+ """
403
+ return self.model_data.supports_structured_outputs
404
+
405
+ def supports_json_mode(self) -> Union[bool, None]:
406
+ """
407
+ OpenAI models that enforce JSON mode
408
+ """
409
+ return self.model_data.supports_json
410
+
487
411
  #########
488
412
  # Model #
489
413
  #########
490
414
 
491
- def get_model_name(self):
492
- return self.model_name
415
+ def generate_content(
416
+ self, multimodal_input: Optional[List[Union[str, MLLMImage]]] = None
417
+ ):
418
+ multimodal_input = [] if multimodal_input is None else multimodal_input
419
+ content = []
420
+ for element in multimodal_input:
421
+ if isinstance(element, str):
422
+ content.append({"type": "text", "text": element})
423
+ elif isinstance(element, MLLMImage):
424
+ if element.url and not element.local:
425
+ content.append(
426
+ {
427
+ "type": "image_url",
428
+ "image_url": {"url": element.url},
429
+ }
430
+ )
431
+ else:
432
+ element.ensure_images_loaded()
433
+ data_uri = (
434
+ f"data:{element.mimeType};base64,{element.dataBase64}"
435
+ )
436
+ content.append(
437
+ {
438
+ "type": "image_url",
439
+ "image_url": {"url": data_uri},
440
+ }
441
+ )
442
+ return content
493
443
 
494
444
  def load_model(self, async_mode: bool = False):
495
445
  if not async_mode:
@@ -512,10 +462,10 @@ class GPTModel(DeepEvalBaseLLM):
512
462
 
513
463
  def _build_client(self, cls):
514
464
  api_key = require_secret_api_key(
515
- self._openai_api_key,
465
+ self.api_key,
516
466
  provider_label="OpenAI",
517
467
  env_var_name="OPENAI_API_KEY",
518
- param_hint="`_openai_api_key` to GPTModel(...)",
468
+ param_hint="`api_key` to GPTModel(...)",
519
469
  )
520
470
 
521
471
  kw = dict(
@@ -531,3 +481,6 @@ class GPTModel(DeepEvalBaseLLM):
531
481
  kw.pop("max_retries", None)
532
482
  return cls(**kw)
533
483
  raise
484
+
485
+ def get_model_name(self):
486
+ return f"{self.name}"