deepeval 3.7.3__py3-none-any.whl → 3.7.5__py3-none-any.whl
This diff covers the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the packages exactly as they appear in those public registries.
- deepeval/_version.py +1 -1
- deepeval/cli/test.py +1 -1
- deepeval/config/settings.py +102 -13
- deepeval/dataset/golden.py +54 -2
- deepeval/evaluate/configs.py +1 -1
- deepeval/evaluate/evaluate.py +16 -8
- deepeval/evaluate/execute.py +74 -27
- deepeval/evaluate/utils.py +26 -22
- deepeval/integrations/pydantic_ai/agent.py +19 -2
- deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
- deepeval/metrics/__init__.py +14 -12
- deepeval/metrics/answer_relevancy/answer_relevancy.py +74 -29
- deepeval/metrics/answer_relevancy/template.py +188 -92
- deepeval/metrics/argument_correctness/template.py +2 -2
- deepeval/metrics/base_metric.py +2 -5
- deepeval/metrics/bias/template.py +3 -3
- deepeval/metrics/contextual_precision/contextual_precision.py +53 -15
- deepeval/metrics/contextual_precision/template.py +115 -66
- deepeval/metrics/contextual_recall/contextual_recall.py +50 -13
- deepeval/metrics/contextual_recall/template.py +106 -55
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +47 -15
- deepeval/metrics/contextual_relevancy/template.py +87 -58
- deepeval/metrics/conversation_completeness/template.py +2 -2
- deepeval/metrics/conversational_dag/templates.py +4 -4
- deepeval/metrics/conversational_g_eval/template.py +4 -3
- deepeval/metrics/dag/templates.py +5 -5
- deepeval/metrics/faithfulness/faithfulness.py +70 -27
- deepeval/metrics/faithfulness/schema.py +1 -1
- deepeval/metrics/faithfulness/template.py +200 -115
- deepeval/metrics/g_eval/utils.py +2 -2
- deepeval/metrics/hallucination/template.py +4 -4
- deepeval/metrics/indicator.py +4 -4
- deepeval/metrics/misuse/template.py +2 -2
- deepeval/metrics/multimodal_metrics/__init__.py +0 -18
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +24 -17
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +26 -21
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +24 -17
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +24 -17
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +19 -19
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +63 -78
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +20 -20
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +71 -50
- deepeval/metrics/non_advice/template.py +2 -2
- deepeval/metrics/pii_leakage/template.py +2 -2
- deepeval/metrics/prompt_alignment/template.py +4 -4
- deepeval/metrics/ragas.py +3 -3
- deepeval/metrics/role_violation/template.py +2 -2
- deepeval/metrics/step_efficiency/step_efficiency.py +1 -1
- deepeval/metrics/tool_correctness/tool_correctness.py +2 -2
- deepeval/metrics/toxicity/template.py +4 -4
- deepeval/metrics/turn_contextual_precision/schema.py +21 -0
- deepeval/metrics/turn_contextual_precision/template.py +187 -0
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +550 -0
- deepeval/metrics/turn_contextual_recall/schema.py +21 -0
- deepeval/metrics/turn_contextual_recall/template.py +178 -0
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +520 -0
- deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
- deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +535 -0
- deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
- deepeval/metrics/turn_faithfulness/template.py +218 -0
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +596 -0
- deepeval/metrics/turn_relevancy/template.py +2 -2
- deepeval/metrics/utils.py +39 -58
- deepeval/models/__init__.py +0 -12
- deepeval/models/base_model.py +16 -38
- deepeval/models/embedding_models/__init__.py +7 -0
- deepeval/models/embedding_models/azure_embedding_model.py +69 -32
- deepeval/models/embedding_models/local_embedding_model.py +39 -22
- deepeval/models/embedding_models/ollama_embedding_model.py +42 -18
- deepeval/models/embedding_models/openai_embedding_model.py +50 -15
- deepeval/models/llms/amazon_bedrock_model.py +1 -2
- deepeval/models/llms/anthropic_model.py +53 -20
- deepeval/models/llms/azure_model.py +140 -43
- deepeval/models/llms/deepseek_model.py +38 -23
- deepeval/models/llms/gemini_model.py +222 -103
- deepeval/models/llms/grok_model.py +39 -27
- deepeval/models/llms/kimi_model.py +39 -23
- deepeval/models/llms/litellm_model.py +103 -45
- deepeval/models/llms/local_model.py +35 -22
- deepeval/models/llms/ollama_model.py +129 -17
- deepeval/models/llms/openai_model.py +151 -50
- deepeval/models/llms/portkey_model.py +149 -0
- deepeval/models/llms/utils.py +5 -3
- deepeval/models/retry_policy.py +17 -14
- deepeval/models/utils.py +94 -4
- deepeval/optimizer/__init__.py +5 -0
- deepeval/optimizer/algorithms/__init__.py +6 -0
- deepeval/optimizer/algorithms/base.py +29 -0
- deepeval/optimizer/algorithms/configs.py +18 -0
- deepeval/optimizer/algorithms/copro/__init__.py +5 -0
- deepeval/optimizer/algorithms/copro/copro.py +836 -0
- deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
- deepeval/optimizer/algorithms/gepa/gepa.py +737 -0
- deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
- deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
- deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
- deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
- deepeval/optimizer/algorithms/simba/__init__.py +5 -0
- deepeval/optimizer/algorithms/simba/simba.py +999 -0
- deepeval/optimizer/algorithms/simba/types.py +15 -0
- deepeval/optimizer/configs.py +31 -0
- deepeval/optimizer/policies.py +227 -0
- deepeval/optimizer/prompt_optimizer.py +263 -0
- deepeval/optimizer/rewriter/__init__.py +5 -0
- deepeval/optimizer/rewriter/rewriter.py +124 -0
- deepeval/optimizer/rewriter/utils.py +214 -0
- deepeval/optimizer/scorer/__init__.py +5 -0
- deepeval/optimizer/scorer/base.py +86 -0
- deepeval/optimizer/scorer/scorer.py +316 -0
- deepeval/optimizer/scorer/utils.py +30 -0
- deepeval/optimizer/types.py +148 -0
- deepeval/optimizer/utils.py +480 -0
- deepeval/prompt/prompt.py +7 -6
- deepeval/test_case/__init__.py +1 -3
- deepeval/test_case/api.py +12 -10
- deepeval/test_case/conversational_test_case.py +19 -1
- deepeval/test_case/llm_test_case.py +152 -1
- deepeval/test_case/utils.py +4 -8
- deepeval/test_run/api.py +15 -14
- deepeval/test_run/cache.py +2 -0
- deepeval/test_run/test_run.py +9 -4
- deepeval/tracing/patchers.py +9 -4
- deepeval/tracing/tracing.py +2 -2
- deepeval/utils.py +89 -0
- {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/METADATA +1 -4
- {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/RECORD +134 -118
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
- deepeval/models/mlllms/__init__.py +0 -4
- deepeval/models/mlllms/azure_model.py +0 -334
- deepeval/models/mlllms/gemini_model.py +0 -284
- deepeval/models/mlllms/ollama_model.py +0 -144
- deepeval/models/mlllms/openai_model.py +0 -258
- deepeval/test_case/mllm_test_case.py +0 -170
- /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
- {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/WHEEL +0 -0
- {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/entry_points.txt +0 -0
deepeval/models/llms/ollama_model.py

@@ -1,15 +1,32 @@
-from
-from typing import Optional, Tuple, Union, Dict
+from typing import TYPE_CHECKING, Optional, Tuple, Union, Dict, List
 from pydantic import BaseModel
+import requests
+import base64
+import io

+from deepeval.config.settings import get_settings
+from deepeval.utils import require_dependency
 from deepeval.models.retry_policy import (
     create_retry_decorator,
 )
-
+from deepeval.utils import convert_to_multi_modal_array, check_if_multimodal
+from deepeval.test_case import MLLMImage
 from deepeval.models import DeepEvalBaseLLM
-from deepeval.key_handler import ModelKeyValues, KEY_FILE_HANDLER
 from deepeval.constants import ProviderSlug as PS

+valid_multimodal_models = [
+    "llava:7b",
+    "llava:13b",
+    "llava:34b",
+    "llama4",
+    "gemma3",
+    "qwen3-vl",
+    "qwen2.5-vl",
+    # TODO: Add more models later on by looking at their catelogue
+]
+
+if TYPE_CHECKING:
+    from ollama import ChatResponse

 retry_ollama = create_retry_decorator(PS.OLLAMA)

@@ -23,20 +40,23 @@ class OllamaModel(DeepEvalBaseLLM):
         generation_kwargs: Optional[Dict] = None,
         **kwargs,
     ):
-
-
-        )
+        settings = get_settings()
+        model = model or settings.LOCAL_MODEL_NAME
         self.base_url = (
             base_url
-            or
+            or (
+                settings.LOCAL_MODEL_BASE_URL
+                and str(settings.LOCAL_MODEL_BASE_URL)
+            )
             or "http://localhost:11434"
         )
         if temperature < 0:
             raise ValueError("Temperature must be >= 0.")
         self.temperature = temperature
+        # Keep sanitized kwargs for client call to strip legacy keys
         self.kwargs = kwargs
         self.generation_kwargs = generation_kwargs or {}
-        super().__init__(
+        super().__init__(model)

     ###############################################
     # Other generate functions
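Note: the hunk above switches OllamaModel from KEY_FILE_HANDLER to get_settings() for its defaults (LOCAL_MODEL_NAME, LOCAL_MODEL_BASE_URL) and passes the resolved model name to super().__init__(). A minimal usage sketch with placeholder values; explicit arguments take precedence over the settings-derived defaults, as the `or` chains above show:

    from deepeval.models.llms.ollama_model import OllamaModel

    # Explicit args win; otherwise settings.LOCAL_MODEL_NAME / LOCAL_MODEL_BASE_URL
    # are consulted, then the hardcoded "http://localhost:11434" fallback.
    model = OllamaModel(model="llava:7b", base_url="http://localhost:11434", temperature=0)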
@@ -47,9 +67,17 @@ class OllamaModel(DeepEvalBaseLLM):
         self, prompt: str, schema: Optional[BaseModel] = None
     ) -> Tuple[Union[str, Dict], float]:
         chat_model = self.load_model()
+
+        if check_if_multimodal(prompt):
+            prompt = convert_to_multi_modal_array(prompt)
+            messages = self.generate_messages(prompt)
+        else:
+            messages = [{"role": "user", "content": prompt}]
+        print(messages)
+
         response: ChatResponse = chat_model.chat(
-            model=self.
-            messages=
+            model=self.name,
+            messages=messages,
             format=schema.model_json_schema() if schema else None,
             options={
                 **{"temperature": self.temperature},
@@ -70,9 +98,16 @@ class OllamaModel(DeepEvalBaseLLM):
         self, prompt: str, schema: Optional[BaseModel] = None
     ) -> Tuple[str, float]:
         chat_model = self.load_model(async_mode=True)
+
+        if check_if_multimodal(prompt):
+            prompt = convert_to_multi_modal_array(prompt)
+            messages = self.generate_messages(prompt)
+        else:
+            messages = [{"role": "user", "content": prompt}]
+
         response: ChatResponse = await chat_model.chat(
-            model=self.
-            messages=
+            model=self.name,
+            messages=messages,
             format=schema.model_json_schema() if schema else None,
             options={
                 **{"temperature": self.temperature},
@@ -88,17 +123,94 @@ class OllamaModel(DeepEvalBaseLLM):
             0,
         )

+    def generate_messages(
+        self, multimodal_input: List[Union[str, MLLMImage]] = []
+    ):
+        messages = []
+        for ele in multimodal_input:
+            if isinstance(ele, str):
+                messages.append(
+                    {
+                        "role": "user",
+                        "content": ele,
+                    }
+                )
+            elif isinstance(ele, MLLMImage):
+                img_b64 = self.convert_to_base64(ele.url, ele.local)
+                if img_b64 is not None:
+                    messages.append(
+                        {
+                            "role": "user",
+                            "images": [img_b64],
+                        }
+                    )
+        return messages
+
+    ###############################################
+    # Utilities
+    ###############################################
+
+    def convert_to_base64(self, image_source: str, is_local: bool) -> str:
+        from PIL import Image
+
+        settings = get_settings()
+        try:
+            if not is_local:
+                response = requests.get(
+                    image_source,
+                    stream=True,
+                    timeout=(
+                        settings.MEDIA_IMAGE_CONNECT_TIMEOUT_SECONDS,
+                        settings.MEDIA_IMAGE_READ_TIMEOUT_SECONDS,
+                    ),
+                )
+                response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
+                image = Image.open(io.BytesIO(response.content))
+            else:
+                image = Image.open(image_source)
+
+            buffered = io.BytesIO()
+            image.save(buffered, format="JPEG")
+            img_str = base64.b64encode(buffered.getvalue()).decode()
+            return img_str
+
+        except (requests.exceptions.RequestException, OSError) as e:
+            # Log, then rethrow so @retry_ollama can retry generate_messages() on network failures
+            print(f"Image fetch/encode failed: {e}")
+            raise
+        except Exception as e:
+            print(f"Error converting image to base64: {e}")
+            return None
+
     ###############################################
     # Model
     ###############################################

     def load_model(self, async_mode: bool = False):
+        ollama = require_dependency(
+            "ollama",
+            provider_label="OllamaModel",
+            install_hint="Install it with `pip install ollama`.",
+        )
         if not async_mode:
-            return self._build_client(Client)
-        return self._build_client(AsyncClient)
+            return self._build_client(ollama.Client)
+        return self._build_client(ollama.AsyncClient)
+
+    def _client_kwargs(self) -> Dict:
+        """Return kwargs forwarded to the underlying Ollama Client/AsyncClient."""
+        return dict(self.kwargs or {})

     def _build_client(self, cls):
-
+        kw = dict(
+            host=self.base_url,
+            **self._client_kwargs(),
+        )
+        return cls(**kw)
+
+    def supports_multimodal(self):
+        if self.name in valid_multimodal_models:
+            return True
+        return False

     def get_model_name(self):
-        return f"{self.
+        return f"{self.name} (Ollama)"
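Taken together, the generate_messages() and convert_to_base64() helpers added above expand a mixed text-and-image prompt into one Ollama chat message per element. A rough sketch of the expected shape (it assumes MLLMImage accepts url/local keyword arguments, which is inferred from the attribute access in the diff rather than stated by it):

    from deepeval.test_case import MLLMImage
    from deepeval.models.llms.ollama_model import OllamaModel

    model = OllamaModel(model="llava:7b")
    messages = model.generate_messages(
        [
            "What is shown in this image?",
            MLLMImage(url="https://example.com/cat.jpg", local=False),
        ]
    )
    # Roughly:
    # [{"role": "user", "content": "What is shown in this image?"},
    #  {"role": "user", "images": ["<base64-encoded JPEG>"]}]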
deepeval/models/llms/openai_model.py

@@ -1,18 +1,23 @@
+import base64
 from openai.types.chat.chat_completion import ChatCompletion
-from
-from
-from pydantic import BaseModel
-
+from typing import Optional, Tuple, Union, Dict, List
+from deepeval.test_case import MLLMImage
+from pydantic import BaseModel, SecretStr
+from io import BytesIO
 from openai import (
     OpenAI,
     AsyncOpenAI,
 )
-
+from deepeval.utils import check_if_multimodal, convert_to_multi_modal_array
 from deepeval.config.settings import get_settings
 from deepeval.constants import ProviderSlug as PS
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.models.llms.utils import trim_and_load_json
-from deepeval.models.utils import
+from deepeval.models.utils import (
+    parse_model_name,
+    require_secret_api_key,
+    normalize_kwargs_and_extract_aliases,
+)
 from deepeval.models.retry_policy import (
     create_retry_decorator,
     sdk_retries_for,
@@ -21,6 +26,7 @@ from deepeval.models.retry_policy import (

 retry_openai = create_retry_decorator(PS.OPENAI)

+
 valid_gpt_models = [
     "gpt-3.5-turbo",
     "gpt-3.5-turbo-0125",
@@ -83,6 +89,15 @@ unsupported_log_probs_gpt_models = [
     "gpt-5-chat-latest",
 ]

+unsupported_log_probs_multimodal_gpt_models = [
+    "o1",
+    "o1-preview",
+    "o1-2024-12-17",
+    "o1-preview-2024-09-12",
+    "gpt-4.5-preview-2025-02-27",
+    "o4-mini",
+]
+
 structured_outputs_models = [
     "gpt-4o",
     "gpt-4o-2024-05-13",
@@ -215,77 +230,98 @@ def _request_timeout_seconds() -> float:
     return timeout if timeout > 0 else 30.0


+_ALIAS_MAP = {
+    "api_key": ["_openai_api_key"],
+}
+
+
 class GPTModel(DeepEvalBaseLLM):
+    valid_multimodal_models = [
+        "gpt-4o",
+        "gpt-4o-mini",
+        "gpt-4.1",
+        "gpt-4.1-mini",
+        "gpt-5",
+    ]
+
     def __init__(
         self,
         model: Optional[str] = None,
-
+        api_key: Optional[str] = None,
         base_url: Optional[str] = None,
+        temperature: float = 0,
         cost_per_input_token: Optional[float] = None,
         cost_per_output_token: Optional[float] = None,
-        temperature: float = 0,
         generation_kwargs: Optional[Dict] = None,
         **kwargs,
     ):
-
-
-
+        normalized_kwargs, alias_values = normalize_kwargs_and_extract_aliases(
+            "GPTModel",
+            kwargs,
+            _ALIAS_MAP,
         )
+
+        # re-map depricated keywords to re-named positional args
+        if api_key is None and "api_key" in alias_values:
+            api_key = alias_values["api_key"]
+
+        settings = get_settings()
+        model = model or settings.OPENAI_MODEL_NAME
         cost_per_input_token = (
             cost_per_input_token
             if cost_per_input_token is not None
-            else
-            ModelKeyValues.OPENAI_COST_PER_INPUT_TOKEN
-            )
+            else settings.OPENAI_COST_PER_INPUT_TOKEN
         )
         cost_per_output_token = (
             cost_per_output_token
             if cost_per_output_token is not None
-            else
-            ModelKeyValues.OPENAI_COST_PER_OUTPUT_TOKEN
-            )
+            else settings.OPENAI_COST_PER_OUTPUT_TOKEN
         )

+        if model is None:
+            model = default_gpt_model
+
         if isinstance(model, str):
-
-            if
+            model = parse_model_name(model)
+            if model not in valid_gpt_models:
                 raise ValueError(
                     f"Invalid model. Available GPT models: {', '.join(model for model in valid_gpt_models)}"
                 )
-        elif model is None:
-            model_name = default_gpt_model

-        if
+        if model not in model_pricing:
             if cost_per_input_token is None or cost_per_output_token is None:
                 raise ValueError(
-                    f"No pricing available for `{
+                    f"No pricing available for `{model}`. "
                     "Please provide both `cost_per_input_token` and `cost_per_output_token` when initializing `GPTModel`, "
                     "or set them via the CLI:\n"
                     "    deepeval set-openai --model=[...] --cost_per_input_token=[...] --cost_per_output_token=[...]"
                 )
             else:
-                model_pricing[
+                model_pricing[model] = {
                     "input": float(cost_per_input_token),
                     "output": float(cost_per_output_token),
                 }

-
-
+        if api_key is not None:
+            # keep it secret, keep it safe from serializings, logging and alike
+            self.api_key: SecretStr | None = SecretStr(api_key)
+        else:
+            self.api_key = get_settings().OPENAI_API_KEY

-        self._openai_api_key = _openai_api_key
         self.base_url = base_url
         # args and kwargs will be passed to the underlying model, in load_model function

         # Auto-adjust temperature for models that require it
-        if
+        if model in models_requiring_temperature_1:
             temperature = 1

         if temperature < 0:
             raise ValueError("Temperature must be >= 0.")
         self.temperature = temperature
-
+        # Keep sanitized kwargs for client call to strip legacy keys
+        self.kwargs = normalized_kwargs
         self.generation_kwargs = generation_kwargs or {}
-        super().__init__(
+        super().__init__(model)

     ###############################################
     # Generate functions
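The constructor above now takes api_key directly, wraps it in a pydantic SecretStr, and routes the legacy _openai_api_key keyword through _ALIAS_MAP / normalize_kwargs_and_extract_aliases. A small sketch of what this means for callers (placeholder key; any deprecation warning emitted by the helper is not shown here):

    from deepeval.models.llms.openai_model import GPTModel

    # Preferred: pass the key explicitly; it is stored as SecretStr, so it stays
    # masked when the model object is logged or serialized.
    model = GPTModel(model="gpt-4o", api_key="sk-placeholder")
    print(model.api_key)                     # **********
    print(model.api_key.get_secret_value())  # sk-placeholder

    # Legacy: `_openai_api_key=...` is pulled out of **kwargs via _ALIAS_MAP and
    # remapped onto `api_key` before being wrapped the same way.
    legacy = GPTModel(model="gpt-4o", _openai_api_key="sk-placeholder")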
@@ -296,10 +332,15 @@ class GPTModel(DeepEvalBaseLLM):
         self, prompt: str, schema: Optional[BaseModel] = None
     ) -> Tuple[Union[str, Dict], float]:
         client = self.load_model(async_mode=False)
+
+        if check_if_multimodal(prompt):
+            prompt = convert_to_multi_modal_array(input=prompt)
+            prompt = self.generate_prompt(prompt)
+
         if schema:
-            if self.
+            if self.name in structured_outputs_models:
                 completion = client.beta.chat.completions.parse(
-                    model=self.
+                    model=self.name,
                     messages=[
                         {"role": "user", "content": prompt},
                     ],
@@ -315,9 +356,9 @@ class GPTModel(DeepEvalBaseLLM):
                     completion.usage.completion_tokens,
                 )
                 return structured_output, cost
-            if self.
+            if self.name in json_mode_models:
                 completion = client.beta.chat.completions.parse(
-                    model=self.
+                    model=self.name,
                     messages=[
                         {"role": "user", "content": prompt},
                     ],
@@ -335,7 +376,7 @@ class GPTModel(DeepEvalBaseLLM):
             return schema.model_validate(json_output), cost

         completion = client.chat.completions.create(
-            model=self.
+            model=self.name,
             messages=[{"role": "user", "content": prompt}],
             temperature=self.temperature,
             **self.generation_kwargs,
@@ -355,10 +396,15 @@ class GPTModel(DeepEvalBaseLLM):
         self, prompt: str, schema: Optional[BaseModel] = None
     ) -> Tuple[Union[str, BaseModel], float]:
         client = self.load_model(async_mode=True)
+
+        if check_if_multimodal(prompt):
+            prompt = convert_to_multi_modal_array(input=prompt)
+            prompt = self.generate_prompt(prompt)
+
         if schema:
-            if self.
+            if self.name in structured_outputs_models:
                 completion = await client.beta.chat.completions.parse(
-                    model=self.
+                    model=self.name,
                     messages=[
                         {"role": "user", "content": prompt},
                     ],
@@ -374,9 +420,9 @@ class GPTModel(DeepEvalBaseLLM):
                     completion.usage.completion_tokens,
                 )
                 return structured_output, cost
-            if self.
+            if self.name in json_mode_models:
                 completion = await client.beta.chat.completions.parse(
-                    model=self.
+                    model=self.name,
                     messages=[
                         {"role": "user", "content": prompt},
                     ],
@@ -394,7 +440,7 @@ class GPTModel(DeepEvalBaseLLM):
             return schema.model_validate(json_output), cost

         completion = await client.chat.completions.create(
-            model=self.
+            model=self.name,
             messages=[{"role": "user", "content": prompt}],
             temperature=self.temperature,
             **self.generation_kwargs,
@@ -421,8 +467,11 @@ class GPTModel(DeepEvalBaseLLM):
     ) -> Tuple[ChatCompletion, float]:
         # Generate completion
         client = self.load_model(async_mode=False)
+        if check_if_multimodal(prompt):
+            prompt = convert_to_multi_modal_array(input=prompt)
+            prompt = self.generate_prompt(prompt)
         completion = client.chat.completions.create(
-            model=self.
+            model=self.name,
             messages=[{"role": "user", "content": prompt}],
             temperature=self.temperature,
             logprobs=True,
@@ -444,8 +493,11 @@ class GPTModel(DeepEvalBaseLLM):
     ) -> Tuple[ChatCompletion, float]:
         # Generate completion
         client = self.load_model(async_mode=True)
+        if check_if_multimodal(prompt):
+            prompt = convert_to_multi_modal_array(input=prompt)
+            prompt = self.generate_prompt(prompt)
         completion = await client.chat.completions.create(
-            model=self.
+            model=self.name,
             messages=[{"role": "user", "content": prompt}],
             temperature=self.temperature,
             logprobs=True,
@@ -464,8 +516,11 @@ class GPTModel(DeepEvalBaseLLM):
         self, prompt: str, n: int, temperature: float
     ) -> Tuple[list[str], float]:
         client = self.load_model(async_mode=False)
+        if check_if_multimodal(prompt):
+            prompt = convert_to_multi_modal_array(input=prompt)
+            prompt = self.generate_prompt(prompt)
         response = client.chat.completions.create(
-            model=self.
+            model=self.name,
             messages=[{"role": "user", "content": prompt}],
             n=n,
             temperature=temperature,
@@ -480,17 +535,49 @@ class GPTModel(DeepEvalBaseLLM):

     def calculate_cost(self, input_tokens: int, output_tokens: int) -> float:
         # TODO: consider loggin a warning instead of defaulting to whole model pricing
-        pricing = model_pricing.get(self.
+        pricing = model_pricing.get(self.name, model_pricing)
         input_cost = input_tokens * pricing["input"]
         output_cost = output_tokens * pricing["output"]
         return input_cost + output_cost

-
-    # Model
-
+    #########
+    # Model #
+    #########

-    def
-
+    def generate_prompt(
+        self, multimodal_input: List[Union[str, MLLMImage]] = []
+    ):
+        prompt = []
+        for ele in multimodal_input:
+            if isinstance(ele, str):
+                prompt.append({"type": "text", "text": ele})
+            elif isinstance(ele, MLLMImage):
+                if ele.local:
+                    import PIL.Image
+
+                    image = PIL.Image.open(ele.url)
+                    visual_dict = {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": f"data:image/jpeg;base64,{self.encode_pil_image(image)}"
+                        },
+                    }
+                else:
+                    visual_dict = {
+                        "type": "image_url",
+                        "image_url": {"url": ele.url},
+                    }
+                prompt.append(visual_dict)
+        return prompt
+
+    def encode_pil_image(self, pil_image):
+        image_buffer = BytesIO()
+        if pil_image.mode in ("RGBA", "LA", "P"):
+            pil_image = pil_image.convert("RGB")
+        pil_image.save(image_buffer, format="JPEG")
+        image_bytes = image_buffer.getvalue()
+        base64_encoded_image = base64.b64encode(image_bytes).decode("utf-8")
+        return base64_encoded_image

     def load_model(self, async_mode: bool = False):
         if not async_mode:
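The generate_prompt() helper added above turns a mixed prompt into OpenAI-style content parts, inlining local images as base64 data URLs via encode_pil_image(). A rough sketch of the expected shape, continuing the earlier placeholder GPTModel instance (MLLMImage kwargs are inferred, not stated by this diff):

    from deepeval.test_case import MLLMImage

    parts = model.generate_prompt(
        [
            "Describe this image.",
            MLLMImage(url="https://example.com/cat.jpg", local=False),
        ]
    )
    # Roughly:
    # [{"type": "text", "text": "Describe this image."},
    #  {"type": "image_url", "image_url": {"url": "https://example.com/cat.jpg"}}]
    # A local image (local=True) is opened with PIL and embedded as a
    # "data:image/jpeg;base64,..." URL instead.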
@@ -512,9 +599,15 @@ class GPTModel(DeepEvalBaseLLM):
         return kwargs

     def _build_client(self, cls):
+        api_key = require_secret_api_key(
+            self.api_key,
+            provider_label="OpenAI",
+            env_var_name="OPENAI_API_KEY",
+            param_hint="`api_key` to GPTModel(...)",
+        )

         kw = dict(
-            api_key=
+            api_key=api_key,
             base_url=self.base_url,
             **self._client_kwargs(),
         )
@@ -526,3 +619,11 @@ class GPTModel(DeepEvalBaseLLM):
             kw.pop("max_retries", None)
             return cls(**kw)
         raise
+
+    def supports_multimodal(self):
+        if self.name in GPTModel.valid_multimodal_models:
+            return True
+        return False
+
+    def get_model_name(self):
+        return f"{self.name}"
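Finally, supports_multimodal() is a plain membership check against the class-level valid_multimodal_models list, and get_model_name() now returns self.name directly. Continuing the placeholder sketch (and assuming both model names resolve pricing from the built-in model_pricing table):

    GPTModel(model="gpt-4o", api_key="sk-placeholder").supports_multimodal()         # True
    GPTModel(model="gpt-3.5-turbo", api_key="sk-placeholder").supports_multimodal()  # False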