deepeval-3.7.5-py3-none-any.whl → deepeval-3.7.7-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/cli/main.py +2022 -759
- deepeval/cli/utils.py +208 -36
- deepeval/config/dotenv_handler.py +19 -0
- deepeval/config/settings.py +675 -245
- deepeval/config/utils.py +9 -1
- deepeval/dataset/api.py +23 -1
- deepeval/dataset/golden.py +106 -21
- deepeval/evaluate/evaluate.py +0 -3
- deepeval/evaluate/execute.py +162 -315
- deepeval/evaluate/utils.py +6 -30
- deepeval/key_handler.py +124 -51
- deepeval/metrics/__init__.py +0 -4
- deepeval/metrics/answer_relevancy/answer_relevancy.py +89 -132
- deepeval/metrics/answer_relevancy/template.py +102 -179
- deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
- deepeval/metrics/arena_g_eval/template.py +17 -1
- deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
- deepeval/metrics/argument_correctness/template.py +19 -2
- deepeval/metrics/base_metric.py +19 -41
- deepeval/metrics/bias/bias.py +102 -108
- deepeval/metrics/bias/template.py +14 -2
- deepeval/metrics/contextual_precision/contextual_precision.py +56 -92
- deepeval/metrics/contextual_recall/contextual_recall.py +58 -85
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +53 -83
- deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
- deepeval/metrics/conversation_completeness/template.py +23 -3
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
- deepeval/metrics/conversational_dag/nodes.py +66 -123
- deepeval/metrics/conversational_dag/templates.py +16 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
- deepeval/metrics/dag/dag.py +10 -0
- deepeval/metrics/dag/nodes.py +63 -126
- deepeval/metrics/dag/templates.py +14 -0
- deepeval/metrics/exact_match/exact_match.py +9 -1
- deepeval/metrics/faithfulness/faithfulness.py +82 -136
- deepeval/metrics/g_eval/g_eval.py +93 -79
- deepeval/metrics/g_eval/template.py +18 -1
- deepeval/metrics/g_eval/utils.py +7 -6
- deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
- deepeval/metrics/goal_accuracy/template.py +21 -3
- deepeval/metrics/hallucination/hallucination.py +60 -75
- deepeval/metrics/hallucination/template.py +13 -0
- deepeval/metrics/indicator.py +11 -10
- deepeval/metrics/json_correctness/json_correctness.py +40 -38
- deepeval/metrics/json_correctness/template.py +10 -0
- deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
- deepeval/metrics/knowledge_retention/schema.py +9 -3
- deepeval/metrics/knowledge_retention/template.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +72 -43
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +93 -75
- deepeval/metrics/mcp/schema.py +4 -0
- deepeval/metrics/mcp/template.py +59 -0
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
- deepeval/metrics/mcp_use_metric/template.py +12 -0
- deepeval/metrics/misuse/misuse.py +77 -97
- deepeval/metrics/misuse/template.py +15 -0
- deepeval/metrics/multimodal_metrics/__init__.py +0 -1
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +37 -38
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +55 -76
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +37 -38
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +37 -38
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +57 -76
- deepeval/metrics/non_advice/non_advice.py +79 -105
- deepeval/metrics/non_advice/template.py +12 -0
- deepeval/metrics/pattern_match/pattern_match.py +12 -4
- deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
- deepeval/metrics/pii_leakage/template.py +14 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
- deepeval/metrics/plan_adherence/template.py +11 -0
- deepeval/metrics/plan_quality/plan_quality.py +63 -87
- deepeval/metrics/plan_quality/template.py +9 -0
- deepeval/metrics/prompt_alignment/prompt_alignment.py +78 -86
- deepeval/metrics/prompt_alignment/template.py +12 -0
- deepeval/metrics/role_adherence/role_adherence.py +48 -71
- deepeval/metrics/role_adherence/template.py +14 -0
- deepeval/metrics/role_violation/role_violation.py +75 -108
- deepeval/metrics/role_violation/template.py +12 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
- deepeval/metrics/step_efficiency/template.py +11 -0
- deepeval/metrics/summarization/summarization.py +115 -183
- deepeval/metrics/summarization/template.py +19 -0
- deepeval/metrics/task_completion/task_completion.py +67 -73
- deepeval/metrics/tool_correctness/tool_correctness.py +43 -42
- deepeval/metrics/tool_use/schema.py +4 -0
- deepeval/metrics/tool_use/template.py +16 -2
- deepeval/metrics/tool_use/tool_use.py +72 -94
- deepeval/metrics/topic_adherence/schema.py +4 -0
- deepeval/metrics/topic_adherence/template.py +21 -1
- deepeval/metrics/topic_adherence/topic_adherence.py +68 -81
- deepeval/metrics/toxicity/template.py +13 -0
- deepeval/metrics/toxicity/toxicity.py +80 -99
- deepeval/metrics/turn_contextual_precision/schema.py +3 -3
- deepeval/metrics/turn_contextual_precision/template.py +9 -2
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +154 -154
- deepeval/metrics/turn_contextual_recall/schema.py +3 -3
- deepeval/metrics/turn_contextual_recall/template.py +8 -1
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +148 -143
- deepeval/metrics/turn_contextual_relevancy/schema.py +2 -2
- deepeval/metrics/turn_contextual_relevancy/template.py +8 -1
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +154 -157
- deepeval/metrics/turn_faithfulness/schema.py +1 -1
- deepeval/metrics/turn_faithfulness/template.py +8 -1
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +180 -203
- deepeval/metrics/turn_relevancy/template.py +14 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
- deepeval/metrics/utils.py +161 -91
- deepeval/models/__init__.py +2 -0
- deepeval/models/base_model.py +44 -6
- deepeval/models/embedding_models/azure_embedding_model.py +34 -12
- deepeval/models/embedding_models/local_embedding_model.py +22 -7
- deepeval/models/embedding_models/ollama_embedding_model.py +17 -6
- deepeval/models/embedding_models/openai_embedding_model.py +3 -2
- deepeval/models/llms/__init__.py +2 -0
- deepeval/models/llms/amazon_bedrock_model.py +229 -73
- deepeval/models/llms/anthropic_model.py +143 -48
- deepeval/models/llms/azure_model.py +169 -95
- deepeval/models/llms/constants.py +2032 -0
- deepeval/models/llms/deepseek_model.py +82 -35
- deepeval/models/llms/gemini_model.py +126 -67
- deepeval/models/llms/grok_model.py +128 -65
- deepeval/models/llms/kimi_model.py +129 -87
- deepeval/models/llms/litellm_model.py +94 -18
- deepeval/models/llms/local_model.py +115 -16
- deepeval/models/llms/ollama_model.py +97 -76
- deepeval/models/llms/openai_model.py +169 -311
- deepeval/models/llms/portkey_model.py +58 -16
- deepeval/models/llms/utils.py +5 -2
- deepeval/models/retry_policy.py +10 -5
- deepeval/models/utils.py +56 -4
- deepeval/simulator/conversation_simulator.py +49 -2
- deepeval/simulator/template.py +16 -1
- deepeval/synthesizer/synthesizer.py +19 -17
- deepeval/test_case/api.py +24 -45
- deepeval/test_case/arena_test_case.py +7 -2
- deepeval/test_case/conversational_test_case.py +55 -6
- deepeval/test_case/llm_test_case.py +60 -6
- deepeval/test_run/api.py +3 -0
- deepeval/test_run/test_run.py +6 -1
- deepeval/utils.py +26 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/METADATA +3 -3
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/RECORD +145 -148
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -133
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/WHEEL +0 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/entry_points.txt +0 -0
deepeval/models/llms/openai_model.py (+169 -311)

@@ -1,13 +1,13 @@
-import base64
 from openai.types.chat.chat_completion import ChatCompletion
 from typing import Optional, Tuple, Union, Dict, List
 from deepeval.test_case import MLLMImage
 from pydantic import BaseModel, SecretStr
-from io import BytesIO
 from openai import (
     OpenAI,
     AsyncOpenAI,
 )
+
+from deepeval.errors import DeepEvalError
 from deepeval.utils import check_if_multimodal, convert_to_multi_modal_array
 from deepeval.config.settings import get_settings
 from deepeval.constants import ProviderSlug as PS
@@ -15,6 +15,7 @@ from deepeval.models import DeepEvalBaseLLM
 from deepeval.models.llms.utils import trim_and_load_json
 from deepeval.models.utils import (
     parse_model_name,
+    require_costs,
     require_secret_api_key,
     normalize_kwargs_and_extract_aliases,
 )
@@ -22,208 +23,15 @@ from deepeval.models.retry_policy import (
     create_retry_decorator,
     sdk_retries_for,
 )
+from deepeval.models.llms.constants import (
+    OPENAI_MODELS_DATA,
+)
 
 
 retry_openai = create_retry_decorator(PS.OPENAI)
 
-
-valid_gpt_models = [
-    "gpt-3.5-turbo",
-    "gpt-3.5-turbo-0125",
-    "gpt-3.5-turbo-1106",
-    "gpt-4-0125-preview",
-    "gpt-4-1106-preview",
-    "gpt-4-turbo",
-    "gpt-4-turbo-2024-04-09",
-    "gpt-4-turbo-preview",
-    "gpt-4o",
-    "gpt-4o-2024-05-13",
-    "gpt-4o-2024-08-06",
-    "gpt-4o-2024-11-20",
-    "gpt-4o-mini",
-    "gpt-4o-mini-2024-07-18",
-    "gpt-4-32k",
-    "gpt-4-32k-0613",
-    "gpt-4.1",
-    "gpt-4.1-mini",
-    "gpt-4.1-nano",
-    "gpt-4.5-preview",
-    "o1",
-    "o1-preview",
-    "o1-2024-12-17",
-    "o1-preview-2024-09-12",
-    "o1-mini",
-    "o1-mini-2024-09-12",
-    "o3-mini",
-    "o3-mini-2025-01-31",
-    "o4-mini",
-    "o4-mini-2025-04-16",
-    "gpt-4.5-preview-2025-02-27",
-    "gpt-5",
-    "gpt-5-2025-08-07",
-    "gpt-5-mini",
-    "gpt-5-mini-2025-08-07",
-    "gpt-5-nano",
-    "gpt-5-nano-2025-08-07",
-    "gpt-5-chat-latest",
-]
-
-unsupported_log_probs_gpt_models = [
-    "o1",
-    "o1-preview",
-    "o1-2024-12-17",
-    "o1-preview-2024-09-12",
-    "o1-mini",
-    "o1-mini-2024-09-12",
-    "o3-mini",
-    "o3-mini-2025-01-31",
-    "o4-mini",
-    "o4-mini-2025-04-16",
-    "gpt-4.5-preview-2025-02-27",
-    "gpt-5",
-    "gpt-5-2025-08-07",
-    "gpt-5-mini",
-    "gpt-5-mini-2025-08-07",
-    "gpt-5-nano",
-    "gpt-5-nano-2025-08-07",
-    "gpt-5-chat-latest",
-]
-
-unsupported_log_probs_multimodal_gpt_models = [
-    "o1",
-    "o1-preview",
-    "o1-2024-12-17",
-    "o1-preview-2024-09-12",
-    "gpt-4.5-preview-2025-02-27",
-    "o4-mini",
-]
-
-structured_outputs_models = [
-    "gpt-4o",
-    "gpt-4o-2024-05-13",
-    "gpt-4o-2024-08-06",
-    "gpt-4o-2024-11-20",
-    "gpt-4o-mini",
-    "gpt-4o-mini-2024-07-18",
-    "gpt-4.1",
-    "gpt-4.1-mini",
-    "gpt-4.1-nano",
-    "o1",
-    "o1-preview",
-    "o1-2024-12-17",
-    "o3-mini",
-    "o3-mini-2025-01-31",
-    "o4-mini",
-    "o4-mini-2025-04-16",
-    "gpt-4.5-preview-2025-02-27",
-    "gpt-5",
-    "gpt-5-2025-08-07",
-    "gpt-5-mini",
-    "gpt-5-mini-2025-08-07",
-    "gpt-5-nano",
-    "gpt-5-nano-2025-08-07",
-]
-
-json_mode_models = [
-    "gpt-3.5-turbo",
-    "gpt-3.5-turbo-0125",
-    "gpt-3.5-turbo-1106",
-    "gpt-4-0125-preview",
-    "gpt-4-1106-preview",
-    "gpt-4-turbo",
-    "gpt-4-turbo-2024-04-09",
-    "gpt-4-turbo-preview",
-    "gpt-4-32k",
-    "gpt-4-32k-0613",
-]
-
-model_pricing = {
-    "gpt-4o-mini": {"input": 0.150 / 1e6, "output": 0.600 / 1e6},
-    "gpt-4o": {"input": 2.50 / 1e6, "output": 10.00 / 1e6},
-    "gpt-4-turbo": {"input": 10.00 / 1e6, "output": 30.00 / 1e6},
-    "gpt-4-turbo-preview": {"input": 10.00 / 1e6, "output": 30.00 / 1e6},
-    "gpt-4-0125-preview": {"input": 10.00 / 1e6, "output": 30.00 / 1e6},
-    "gpt-4-1106-preview": {"input": 10.00 / 1e6, "output": 30.00 / 1e6},
-    "gpt-4": {"input": 30.00 / 1e6, "output": 60.00 / 1e6},
-    "gpt-4-32k": {"input": 60.00 / 1e6, "output": 120.00 / 1e6},
-    "gpt-3.5-turbo-1106": {"input": 1.00 / 1e6, "output": 2.00 / 1e6},
-    "gpt-3.5-turbo": {"input": 0.50 / 1e6, "output": 1.50 / 1e6},
-    "gpt-3.5-turbo-16k": {"input": 3.00 / 1e6, "output": 4.00 / 1e6},
-    "gpt-3.5-turbo-0125": {"input": 0.50 / 1e6, "output": 1.50 / 1e6},
-    "gpt-3.5-turbo-instruct": {"input": 1.50 / 1e6, "output": 2.00 / 1e6},
-    "o1": {"input": 15.00 / 1e6, "output": 60.00 / 1e6},
-    "o1-preview": {"input": 15.00 / 1e6, "output": 60.00 / 1e6},
-    "o1-2024-12-17": {"input": 15.00 / 1e6, "output": 60.00 / 1e6},
-    "o3-mini": {"input": 1.10 / 1e6, "output": 4.40 / 1e6},
-    "o3-mini-2025-01-31": {"input": 1.10 / 1e6, "output": 4.40 / 1e6},
-    "o4-mini": {"input": 1.10 / 1e6, "output": 4.40 / 1e6},
-    "o4-mini-2025-04-16": {"input": 1.10 / 1e6, "output": 4.40 / 1e6},
-    "gpt-4.1": {
-        "input": 2.00 / 1e6,
-        "output": 8.00 / 1e6,
-    },
-    "gpt-4.1-mini": {
-        "input": 0.4 / 1e6,
-        "output": 1.60 / 1e6,
-    },
-    "gpt-4.1-nano": {
-        "input": 0.1 / 1e6,
-        "output": 0.4 / 1e6,
-    },
-    "gpt-4.5-preview": {
-        "input": 75.00 / 1e6,
-        "output": 150.00 / 1e6,
-    },
-    "gpt-5": {
-        "input": 1.25 / 1e6,
-        "output": 10.00 / 1e6,
-    },
-    "gpt-5-2025-08-07": {
-        "input": 1.25 / 1e6,
-        "output": 10.00 / 1e6,
-    },
-    "gpt-5-mini": {
-        "input": 0.25 / 1e6,
-        "output": 2.00 / 1e6,
-    },
-    "gpt-5-mini-2025-08-07": {
-        "input": 0.25 / 1e6,
-        "output": 2.00 / 1e6,
-    },
-    "gpt-5-nano": {
-        "input": 0.05 / 1e6,
-        "output": 0.40 / 1e6,
-    },
-    "gpt-5-nano-2025-08-07": {
-        "input": 0.05 / 1e6,
-        "output": 0.40 / 1e6,
-    },
-    "gpt-5-chat-latest": {
-        "input": 1.25 / 1e6,
-        "output": 10.00 / 1e6,
-    },
-}
-
 default_gpt_model = "gpt-4.1"
 
-# Thinking models that require temperature=1
-models_requiring_temperature_1 = [
-    "o1",
-    "o1-2024-12-17",
-    "o1-mini",
-    "o1-mini-2024-09-12",
-    "o3-mini",
-    "o3-mini-2025-01-31",
-    "o4-mini",
-    "o4-mini-2025-04-16",
-    "gpt-5",
-    "gpt-5-2025-08-07",
-    "gpt-5-mini",
-    "gpt-5-mini-2025-08-07",
-    "gpt-5-nano",
-    "gpt-5-nano-2025-08-07",
-]
-
 
 def _request_timeout_seconds() -> float:
     timeout = float(get_settings().DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS or 0)
@@ -236,25 +44,20 @@ _ALIAS_MAP = {
 
 
 class GPTModel(DeepEvalBaseLLM):
-    valid_multimodal_models = [
-        "gpt-4o",
-        "gpt-4o-mini",
-        "gpt-4.1",
-        "gpt-4.1-mini",
-        "gpt-5",
-    ]
 
     def __init__(
         self,
         model: Optional[str] = None,
        api_key: Optional[str] = None,
         base_url: Optional[str] = None,
-        temperature: float = 0,
+        temperature: Optional[float] = None,
         cost_per_input_token: Optional[float] = None,
         cost_per_output_token: Optional[float] = None,
         generation_kwargs: Optional[Dict] = None,
         **kwargs,
     ):
+        settings = get_settings()
+
         normalized_kwargs, alias_values = normalize_kwargs_and_extract_aliases(
             "GPTModel",
             kwargs,
@@ -265,8 +68,10 @@ class GPTModel(DeepEvalBaseLLM):
         if api_key is None and "api_key" in alias_values:
             api_key = alias_values["api_key"]
 
-        settings = get_settings()
         model = model or settings.OPENAI_MODEL_NAME
+        if model is None:
+            model = default_gpt_model
+
         cost_per_input_token = (
             cost_per_input_token
             if cost_per_input_token is not None
@@ -278,71 +83,80 @@ class GPTModel(DeepEvalBaseLLM):
             else settings.OPENAI_COST_PER_OUTPUT_TOKEN
         )
 
-        if model is None:
-            model = default_gpt_model
-
-        if isinstance(model, str):
-            model = parse_model_name(model)
-            if model not in valid_gpt_models:
-                raise ValueError(
-                    f"Invalid model. Available GPT models: {', '.join(model for model in valid_gpt_models)}"
-                )
-
-        if model not in model_pricing:
-            if cost_per_input_token is None or cost_per_output_token is None:
-                raise ValueError(
-                    f"No pricing available for `{model}`. "
-                    "Please provide both `cost_per_input_token` and `cost_per_output_token` when initializing `GPTModel`, "
-                    "or set them via the CLI:\n"
-                    "    deepeval set-openai --model=[...] --cost_per_input_token=[...] --cost_per_output_token=[...]"
-                )
-            else:
-                model_pricing[model] = {
-                    "input": float(cost_per_input_token),
-                    "output": float(cost_per_output_token),
-                }
-
         if api_key is not None:
             # keep it secret, keep it safe from serializings, logging and alike
-            self.api_key: SecretStr = SecretStr(api_key)
+            self.api_key: Optional[SecretStr] = SecretStr(api_key)
         else:
-            self.api_key =
+            self.api_key = settings.OPENAI_API_KEY
 
-        self.base_url =
+        self.base_url = (
+            str(base_url).rstrip("/") if base_url is not None else None
+        )
         # args and kwargs will be passed to the underlying model, in load_model function
 
-
-        if model in models_requiring_temperature_1:
+        if temperature is not None:
+            temperature = float(temperature)
+        elif settings.TEMPERATURE is not None:
+            temperature = settings.TEMPERATURE
+        else:
+            temperature = 0.0
+
+        if isinstance(model, str):
+            model = parse_model_name(model)
+
+        self.model_data = OPENAI_MODELS_DATA.get(model)
+
+        # Auto-adjust temperature for known models that require it
+        if self.model_data.supports_temperature is False:
             temperature = 1
 
+        # validation
+        cost_per_input_token, cost_per_output_token = require_costs(
+            self.model_data,
+            model,
+            "OPENAI_COST_PER_INPUT_TOKEN",
+            "OPENAI_COST_PER_OUTPUT_TOKEN",
+            cost_per_input_token,
+            cost_per_output_token,
+        )
+        self.model_data.input_price = cost_per_input_token
+        self.model_data.output_price = cost_per_output_token
+
         if temperature < 0:
-            raise ValueError("Temperature must be >= 0.")
+            raise DeepEvalError("Temperature must be >= 0.")
+
         self.temperature = temperature
         # Keep sanitized kwargs for client call to strip legacy keys
         self.kwargs = normalized_kwargs
-        self.generation_kwargs = generation_kwargs or {}
+        self.kwargs.pop("temperature", None)
+
+        self.generation_kwargs = dict(generation_kwargs or {})
+        self.generation_kwargs.pop("temperature", None)
+
         super().__init__(model)
 
-
-    # Generate functions
-
+    ######################
+    # Generate functions #
+    ######################
 
     @retry_openai
     def generate(
         self, prompt: str, schema: Optional[BaseModel] = None
-    ) -> Tuple[Union[str, Dict], float]:
+    ) -> Tuple[Union[str, BaseModel], float]:
        client = self.load_model(async_mode=False)
 
         if check_if_multimodal(prompt):
             prompt = convert_to_multi_modal_array(input=prompt)
-            prompt = self.generate_prompt(multimodal_input=prompt)
+            content = self.generate_content(prompt)
+        else:
+            content = [{"type": "text", "text": prompt}]
 
         if schema:
-            if self.name in structured_outputs_models:
+            if self.supports_structured_outputs() is True:
                 completion = client.beta.chat.completions.parse(
                     model=self.name,
                     messages=[
-                        {"role": "user", "content": prompt},
+                        {"role": "user", "content": content},
                     ],
                     response_format=schema,
                     temperature=self.temperature,
@@ -356,11 +170,11 @@ class GPTModel(DeepEvalBaseLLM):
                     completion.usage.completion_tokens,
                 )
                 return structured_output, cost
-            if self.name in json_mode_models:
+            if self.supports_json_mode() is True:
                 completion = client.beta.chat.completions.parse(
                     model=self.name,
                     messages=[
-                        {"role": "user", "content": prompt},
+                        {"role": "user", "content": content},
                     ],
                     response_format={"type": "json_object"},
                     temperature=self.temperature,
@@ -377,7 +191,7 @@ class GPTModel(DeepEvalBaseLLM):
 
         completion = client.chat.completions.create(
             model=self.name,
-            messages=[{"role": "user", "content": prompt}],
+            messages=[{"role": "user", "content": content}],
             temperature=self.temperature,
             **self.generation_kwargs,
         )
@@ -399,14 +213,16 @@ class GPTModel(DeepEvalBaseLLM):
 
         if check_if_multimodal(prompt):
             prompt = convert_to_multi_modal_array(input=prompt)
-            prompt = self.generate_prompt(multimodal_input=prompt)
+            content = self.generate_content(prompt)
+        else:
+            content = [{"type": "text", "text": prompt}]
 
         if schema:
-            if self.name in structured_outputs_models:
+            if self.supports_structured_outputs() is True:
                 completion = await client.beta.chat.completions.parse(
                     model=self.name,
                     messages=[
-                        {"role": "user", "content": prompt},
+                        {"role": "user", "content": content},
                     ],
                     response_format=schema,
                     temperature=self.temperature,
@@ -420,11 +236,11 @@ class GPTModel(DeepEvalBaseLLM):
                     completion.usage.completion_tokens,
                 )
                 return structured_output, cost
-            if self.name in json_mode_models:
+            if self.supports_json_mode() is True:
                 completion = await client.beta.chat.completions.parse(
                     model=self.name,
                     messages=[
-                        {"role": "user", "content": prompt},
+                        {"role": "user", "content": content},
                     ],
                     response_format={"type": "json_object"},
                     temperature=self.temperature,
@@ -441,7 +257,7 @@ class GPTModel(DeepEvalBaseLLM):
 
         completion = await client.chat.completions.create(
             model=self.name,
-            messages=[{"role": "user", "content": prompt}],
+            messages=[{"role": "user", "content": content}],
             temperature=self.temperature,
             **self.generation_kwargs,
         )
@@ -455,9 +271,9 @@ class GPTModel(DeepEvalBaseLLM):
         else:
             return output, cost
 
-
-    # Other generate functions
-
+    ############################
+    # Other generate functions #
+    ############################
 
     @retry_openai
     def generate_raw_response(
@@ -466,13 +282,26 @@ class GPTModel(DeepEvalBaseLLM):
         top_logprobs: int = 5,
     ) -> Tuple[ChatCompletion, float]:
         # Generate completion
+        model_name = self.name
+        is_multimodal = check_if_multimodal(prompt)
+
+        # validate that this model supports logprobs
+        if self.supports_log_probs() is False:
+            raise DeepEvalError(
+                f"Model `{model_name}` does not support `logprobs` / `top_logprobs`. "
+                "Please use a different OpenAI model (for example `gpt-4.1` or `gpt-4o`) "
+                "when calling `generate_raw_response`."
+            )
+
         client = self.load_model(async_mode=False)
-        if check_if_multimodal(prompt):
+        if is_multimodal:
             prompt = convert_to_multi_modal_array(input=prompt)
-            prompt = self.generate_prompt(multimodal_input=prompt)
+            content = self.generate_content(prompt)
+        else:
+            content = [{"type": "text", "text": prompt}]
         completion = client.chat.completions.create(
             model=self.name,
-            messages=[{"role": "user", "content": prompt}],
+            messages=[{"role": "user", "content": content}],
             temperature=self.temperature,
             logprobs=True,
             top_logprobs=top_logprobs,
@@ -492,13 +321,26 @@ class GPTModel(DeepEvalBaseLLM):
         top_logprobs: int = 5,
     ) -> Tuple[ChatCompletion, float]:
         # Generate completion
+        model_name = self.name
+        is_multimodal = check_if_multimodal(prompt)
+
+        # validate that this model supports logprobs
+        if self.supports_log_probs() is False:
+            raise DeepEvalError(
+                f"Model `{model_name}` does not support `logprobs` / `top_logprobs`. "
+                "Please use a different OpenAI model (for example `gpt-4.1` or `gpt-4o`) "
+                "when calling `a_generate_raw_response`."
+            )
+
         client = self.load_model(async_mode=True)
-        if check_if_multimodal(prompt):
+        if is_multimodal:
             prompt = convert_to_multi_modal_array(input=prompt)
-            prompt = self.generate_prompt(multimodal_input=prompt)
+            content = self.generate_content(prompt)
+        else:
+            content = [{"type": "text", "text": prompt}]
         completion = await client.chat.completions.create(
             model=self.name,
-            messages=[{"role": "user", "content": prompt}],
+            messages=[{"role": "user", "content": content}],
             temperature=self.temperature,
             logprobs=True,
             top_logprobs=top_logprobs,
@@ -514,14 +356,16 @@ class GPTModel(DeepEvalBaseLLM):
     @retry_openai
     def generate_samples(
         self, prompt: str, n: int, temperature: float
-    ) -> List[str]:
+    ) -> list[str]:
         client = self.load_model(async_mode=False)
         if check_if_multimodal(prompt):
             prompt = convert_to_multi_modal_array(input=prompt)
-            prompt = self.generate_prompt(multimodal_input=prompt)
+            content = self.generate_content(prompt)
+        else:
+            content = [{"type": "text", "text": prompt}]
         response = client.chat.completions.create(
             model=self.name,
-            messages=[{"role": "user", "content": prompt}],
+            messages=[{"role": "user", "content": content}],
             n=n,
             temperature=temperature,
             **self.generation_kwargs,
@@ -529,55 +373,74 @@ class GPTModel(DeepEvalBaseLLM):
         completions = [choice.message.content for choice in response.choices]
         return completions
 
-
-    # Utilities
-
+    #############
+    # Utilities #
+    #############
 
     def calculate_cost(self, input_tokens: int, output_tokens: int) -> float:
-
-
-
-
-
+        if self.model_data.input_price and self.model_data.output_price:
+            input_cost = input_tokens * self.model_data.input_price
+            output_cost = output_tokens * self.model_data.output_price
+            return input_cost + output_cost
+
+    #########################
+    # Capabilities #
+    #########################
+
+    def supports_log_probs(self) -> Union[bool, None]:
+        return self.model_data.supports_log_probs
+
+    def supports_temperature(self) -> Union[bool, None]:
+        return self.model_data.supports_temperature
+
+    def supports_multimodal(self) -> Union[bool, None]:
+        return self.model_data.supports_multimodal
+
+    def supports_structured_outputs(self) -> Union[bool, None]:
+        """
+        OpenAI models that natively enforce typed structured outputs.
+        Used by generate(...) when a schema is provided.
+        """
+        return self.model_data.supports_structured_outputs
+
+    def supports_json_mode(self) -> Union[bool, None]:
+        """
+        OpenAI models that enforce JSON mode
+        """
+        return self.model_data.supports_json
 
     #########
     # Model #
     #########
 
-    def generate_prompt(
-        self, multimodal_input: List[Union[str, MLLMImage]] = []
+    def generate_content(
+        self, multimodal_input: Optional[List[Union[str, MLLMImage]]] = None
     ):
-
-
-
-
-
-
-
-
-
-
-
-
-
-                    },
-                }
+        multimodal_input = [] if multimodal_input is None else multimodal_input
+        content = []
+        for element in multimodal_input:
+            if isinstance(element, str):
+                content.append({"type": "text", "text": element})
+            elif isinstance(element, MLLMImage):
+                if element.url and not element.local:
+                    content.append(
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": element.url},
+                        }
+                    )
                 else:
-
-
-                        "
-
-
-
-
-
-
-
-        pil_image.save(image_buffer, format="JPEG")
-        image_bytes = image_buffer.getvalue()
-        base64_encoded_image = base64.b64encode(image_bytes).decode("utf-8")
-        return base64_encoded_image
+                    element.ensure_images_loaded()
+                    data_uri = (
+                        f"data:{element.mimeType};base64,{element.dataBase64}"
+                    )
+                    content.append(
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": data_uri},
+                        }
+                    )
+        return content
 
     def load_model(self, async_mode: bool = False):
         if not async_mode:
@@ -620,10 +483,5 @@ class GPTModel(DeepEvalBaseLLM):
             return cls(**kw)
         raise
 
-    def supports_multimodal(self):
-        if self.name in GPTModel.valid_multimodal_models:
-            return True
-        return False
-
     def get_model_name(self):
         return f"{self.name}"