deepeval-3.7.4-py3-none-any.whl → deepeval-3.7.5-py3-none-any.whl
This diff compares two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- deepeval/_version.py +1 -1
- deepeval/dataset/golden.py +54 -2
- deepeval/evaluate/evaluate.py +16 -8
- deepeval/evaluate/execute.py +70 -26
- deepeval/evaluate/utils.py +26 -22
- deepeval/integrations/pydantic_ai/agent.py +19 -2
- deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
- deepeval/metrics/__init__.py +14 -12
- deepeval/metrics/answer_relevancy/answer_relevancy.py +74 -29
- deepeval/metrics/answer_relevancy/template.py +188 -92
- deepeval/metrics/base_metric.py +2 -5
- deepeval/metrics/contextual_precision/contextual_precision.py +53 -15
- deepeval/metrics/contextual_precision/template.py +115 -66
- deepeval/metrics/contextual_recall/contextual_recall.py +50 -13
- deepeval/metrics/contextual_recall/template.py +106 -55
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +47 -15
- deepeval/metrics/contextual_relevancy/template.py +87 -58
- deepeval/metrics/dag/templates.py +2 -2
- deepeval/metrics/faithfulness/faithfulness.py +70 -27
- deepeval/metrics/faithfulness/schema.py +1 -1
- deepeval/metrics/faithfulness/template.py +200 -115
- deepeval/metrics/g_eval/utils.py +2 -2
- deepeval/metrics/indicator.py +4 -4
- deepeval/metrics/multimodal_metrics/__init__.py +0 -18
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +24 -17
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +26 -21
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +24 -17
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +24 -17
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +19 -19
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +63 -78
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +20 -20
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +71 -50
- deepeval/metrics/ragas.py +3 -3
- deepeval/metrics/tool_correctness/tool_correctness.py +2 -2
- deepeval/metrics/turn_contextual_precision/schema.py +21 -0
- deepeval/metrics/turn_contextual_precision/template.py +187 -0
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +550 -0
- deepeval/metrics/turn_contextual_recall/schema.py +21 -0
- deepeval/metrics/turn_contextual_recall/template.py +178 -0
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +520 -0
- deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
- deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +535 -0
- deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
- deepeval/metrics/turn_faithfulness/template.py +218 -0
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +596 -0
- deepeval/metrics/utils.py +39 -58
- deepeval/models/__init__.py +0 -12
- deepeval/models/base_model.py +16 -38
- deepeval/models/embedding_models/__init__.py +7 -0
- deepeval/models/embedding_models/azure_embedding_model.py +52 -28
- deepeval/models/embedding_models/local_embedding_model.py +18 -14
- deepeval/models/embedding_models/ollama_embedding_model.py +38 -16
- deepeval/models/embedding_models/openai_embedding_model.py +40 -21
- deepeval/models/llms/amazon_bedrock_model.py +1 -2
- deepeval/models/llms/anthropic_model.py +44 -23
- deepeval/models/llms/azure_model.py +121 -36
- deepeval/models/llms/deepseek_model.py +18 -13
- deepeval/models/llms/gemini_model.py +129 -43
- deepeval/models/llms/grok_model.py +18 -13
- deepeval/models/llms/kimi_model.py +18 -13
- deepeval/models/llms/litellm_model.py +42 -22
- deepeval/models/llms/local_model.py +12 -7
- deepeval/models/llms/ollama_model.py +114 -12
- deepeval/models/llms/openai_model.py +137 -41
- deepeval/models/llms/portkey_model.py +24 -7
- deepeval/models/llms/utils.py +5 -3
- deepeval/models/retry_policy.py +17 -14
- deepeval/models/utils.py +46 -1
- deepeval/optimizer/__init__.py +5 -0
- deepeval/optimizer/algorithms/__init__.py +6 -0
- deepeval/optimizer/algorithms/base.py +29 -0
- deepeval/optimizer/algorithms/configs.py +18 -0
- deepeval/optimizer/algorithms/copro/__init__.py +5 -0
- deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
- deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
- deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
- deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
- deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
- deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
- deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
- deepeval/optimizer/algorithms/simba/__init__.py +5 -0
- deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
- deepeval/{optimization → optimizer}/configs.py +5 -8
- deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
- deepeval/optimizer/prompt_optimizer.py +263 -0
- deepeval/optimizer/rewriter/__init__.py +5 -0
- deepeval/optimizer/rewriter/rewriter.py +124 -0
- deepeval/optimizer/rewriter/utils.py +214 -0
- deepeval/optimizer/scorer/__init__.py +5 -0
- deepeval/optimizer/scorer/base.py +86 -0
- deepeval/optimizer/scorer/scorer.py +316 -0
- deepeval/optimizer/scorer/utils.py +30 -0
- deepeval/optimizer/types.py +148 -0
- deepeval/{optimization → optimizer}/utils.py +47 -165
- deepeval/prompt/prompt.py +5 -9
- deepeval/test_case/__init__.py +1 -3
- deepeval/test_case/api.py +12 -10
- deepeval/test_case/conversational_test_case.py +19 -1
- deepeval/test_case/llm_test_case.py +152 -1
- deepeval/test_case/utils.py +4 -8
- deepeval/test_run/api.py +15 -14
- deepeval/test_run/test_run.py +3 -3
- deepeval/tracing/patchers.py +9 -4
- deepeval/tracing/tracing.py +2 -2
- deepeval/utils.py +65 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/METADATA +1 -4
- {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/RECORD +116 -125
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
- deepeval/models/mlllms/__init__.py +0 -4
- deepeval/models/mlllms/azure_model.py +0 -343
- deepeval/models/mlllms/gemini_model.py +0 -313
- deepeval/models/mlllms/ollama_model.py +0 -175
- deepeval/models/mlllms/openai_model.py +0 -309
- deepeval/optimization/__init__.py +0 -13
- deepeval/optimization/adapters/__init__.py +0 -2
- deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
- deepeval/optimization/aggregates.py +0 -14
- deepeval/optimization/copro/configs.py +0 -31
- deepeval/optimization/gepa/__init__.py +0 -7
- deepeval/optimization/gepa/configs.py +0 -115
- deepeval/optimization/miprov2/configs.py +0 -134
- deepeval/optimization/miprov2/loop.py +0 -785
- deepeval/optimization/mutations/__init__.py +0 -0
- deepeval/optimization/mutations/prompt_rewriter.py +0 -458
- deepeval/optimization/policies/__init__.py +0 -16
- deepeval/optimization/policies/tie_breaker.py +0 -67
- deepeval/optimization/prompt_optimizer.py +0 -462
- deepeval/optimization/simba/__init__.py +0 -0
- deepeval/optimization/simba/configs.py +0 -33
- deepeval/optimization/types.py +0 -361
- deepeval/test_case/mllm_test_case.py +0 -170
- /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
- /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/WHEEL +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/entry_points.txt +0 -0
deepeval/models/llms/gemini_model.py

@@ -1,17 +1,32 @@
 import json
-
+import requests
 from pydantic import BaseModel, SecretStr
-from
-from typing import Optional, Dict
+from typing import TYPE_CHECKING, Optional, Dict, List, Union
 
+from deepeval.test_case import MLLMImage
 from deepeval.config.settings import get_settings
 from deepeval.models.utils import require_secret_api_key
 from deepeval.models.retry_policy import (
     create_retry_decorator,
 )
+from deepeval.utils import (
+    convert_to_multi_modal_array,
+    check_if_multimodal,
+    require_dependency,
+)
 from deepeval.models.base_model import DeepEvalBaseLLM
 from deepeval.constants import ProviderSlug as PS
-
+
+valid_multimodal_models = [
+    "gemini-2.5-pro",
+    "gemini-2.5-flash",
+    "gemini-1.5-pro",
+    "gemini-1.5-flash",
+    # TODO: Add more models later
+]
+
+if TYPE_CHECKING:
+    from google.genai import Client
 
 default_gemini_model = "gemini-1.5-pro"
 
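Two things stand out in the new import block: `google.genai` is no longer imported at module top level, and the `Client` type is only pulled in under `if TYPE_CHECKING:`, so annotations keep working even when the SDK is not installed. A minimal sketch of that pattern, with a hypothetical `heavy_sdk` standing in for `google.genai`:

```python
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Evaluated only by type checkers, never at runtime, so the SDK
    # stays an optional dependency. `heavy_sdk` is a hypothetical stand-in.
    from heavy_sdk import Client


def build_client() -> "Client":
    # Deferred import: an ImportError surfaces only when a client is
    # actually requested, not when this module is imported.
    import heavy_sdk

    return heavy_sdk.Client()
```

Type checkers resolve the string annotation against the guarded import, while runtime users only pay the import cost (and the dependency requirement) when a client is actually built.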
@@ -28,7 +43,7 @@ class GeminiModel(DeepEvalBaseLLM):
     To use Vertex AI API, set project and location attributes.
 
     Attributes:
-
+        model: Name of the Gemini model to use
         api_key: Google API key for authentication
         project: Google Cloud project ID
         location: Google Cloud location
@@ -39,7 +54,7 @@ class GeminiModel(DeepEvalBaseLLM):
 
     # Initialize the model
     model = GeminiModel(
-
+        model="gemini-1.5-pro-001",
         api_key="your-api-key"
     )
 
@@ -50,21 +65,19 @@ class GeminiModel(DeepEvalBaseLLM):
 
     def __init__(
         self,
-
+        model: Optional[str] = None,
         api_key: Optional[str] = None,
+        temperature: float = 0,
         project: Optional[str] = None,
         location: Optional[str] = None,
         service_account_key: Optional[Dict[str, str]] = None,
-        temperature: float = 0,
         generation_kwargs: Optional[Dict] = None,
         **kwargs,
     ):
 
         settings = get_settings()
 
-
-            model_name or settings.GEMINI_MODEL_NAME or default_gemini_model
-        )
+        model = model or settings.GEMINI_MODEL_NAME or default_gemini_model
 
         # Get API key from settings if not provided
         if api_key is not None:
@@ -98,27 +111,28 @@ class GeminiModel(DeepEvalBaseLLM):
         self.kwargs = kwargs
         self.generation_kwargs = generation_kwargs or {}
 
+        self._module = self._require_module()
         # Configure default model generation settings
         self.model_safety_settings = [
-            types.SafetySetting(
-                category=types.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
-                threshold=types.HarmBlockThreshold.BLOCK_NONE,
+            self._module.types.SafetySetting(
+                category=self._module.types.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
+                threshold=self._module.types.HarmBlockThreshold.BLOCK_NONE,
             ),
-            types.SafetySetting(
-                category=types.HarmCategory.HARM_CATEGORY_HARASSMENT,
-                threshold=types.HarmBlockThreshold.BLOCK_NONE,
+            self._module.types.SafetySetting(
+                category=self._module.types.HarmCategory.HARM_CATEGORY_HARASSMENT,
+                threshold=self._module.types.HarmBlockThreshold.BLOCK_NONE,
             ),
-            types.SafetySetting(
-                category=types.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
-                threshold=types.HarmBlockThreshold.BLOCK_NONE,
+            self._module.types.SafetySetting(
+                category=self._module.types.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
+                threshold=self._module.types.HarmBlockThreshold.BLOCK_NONE,
             ),
-            types.SafetySetting(
-                category=types.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
-                threshold=types.HarmBlockThreshold.BLOCK_NONE,
+            self._module.types.SafetySetting(
+                category=self._module.types.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
+                threshold=self._module.types.HarmBlockThreshold.BLOCK_NONE,
             ),
         ]
 
-        super().__init__(
+        super().__init__(model)
 
     def should_use_vertexai(self) -> bool:
         """Checks if the model should use Vertex AI for generation.
@@ -137,6 +151,50 @@ class GeminiModel(DeepEvalBaseLLM):
         else:
             return False
 
+    @retry_gemini
+    def generate_prompt(
+        self, multimodal_input: List[Union[str, MLLMImage]] = []
+    ) -> List[Union[str, MLLMImage]]:
+        """Converts DeepEval multimodal input into GenAI SDK compatible format.
+
+        Args:
+            multimodal_input: List of strings and MLLMImage objects
+
+        Returns:
+            List of strings and PIL Image objects ready for model input
+
+        Raises:
+            ValueError: If an invalid input type is provided
+        """
+        prompt = []
+        settings = get_settings()
+
+        for ele in multimodal_input:
+            if isinstance(ele, str):
+                prompt.append(ele)
+            elif isinstance(ele, MLLMImage):
+                if ele.local:
+                    with open(ele.url, "rb") as f:
+                        image_data = f.read()
+                else:
+                    response = requests.get(
+                        ele.url,
+                        timeout=(
+                            settings.MEDIA_IMAGE_CONNECT_TIMEOUT_SECONDS,
+                            settings.MEDIA_IMAGE_READ_TIMEOUT_SECONDS,
+                        ),
+                    )
+                    response.raise_for_status()
+                    image_data = response.content
+
+                image_part = self._module.types.Part.from_bytes(
+                    data=image_data, mime_type="image/jpeg"
+                )
+                prompt.append(image_part)
+            else:
+                raise ValueError(f"Invalid input type: {type(ele)}")
+        return prompt
+
     ###############################################
     # Generate functions
     ###############################################
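The new `generate_prompt` method is what lets a single `generate` entry point accept mixed text-and-image input: strings pass through untouched, local `MLLMImage`s are read from disk, and remote ones are fetched with the configured connect/read timeouts before being wrapped as `Part` objects. A hedged usage sketch, assuming `MLLMImage` takes `url` and `local` keyword arguments (the URL is a placeholder):

```python
from deepeval.models.llms.gemini_model import GeminiModel
from deepeval.test_case import MLLMImage

model = GeminiModel(model="gemini-1.5-pro", api_key="your-api-key")

# Mixed text + image input; the URL is a placeholder.
parts = model.generate_prompt(
    [
        "Describe what this chart shows:",
        MLLMImage(url="https://example.com/chart.png", local=False),
    ]
)
# `parts` now interleaves plain strings with google.genai Part objects,
# ready to pass as `contents=` to generate_content.
```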
@@ -154,11 +212,16 @@ class GeminiModel(DeepEvalBaseLLM):
         """
         client = self.load_model()
 
+        if check_if_multimodal(prompt):
+
+            prompt = convert_to_multi_modal_array(prompt)
+            prompt = self.generate_prompt(prompt)
+
         if schema is not None:
             response = client.models.generate_content(
-                model=self.
+                model=self.name,
                 contents=prompt,
-                config=types.GenerateContentConfig(
+                config=self._module.types.GenerateContentConfig(
                     response_mime_type="application/json",
                     response_schema=schema,
                     safety_settings=self.model_safety_settings,
@@ -169,9 +232,9 @@ class GeminiModel(DeepEvalBaseLLM):
             return response.parsed, 0
         else:
             response = client.models.generate_content(
-                model=self.
+                model=self.name,
                 contents=prompt,
-                config=types.GenerateContentConfig(
+                config=self._module.types.GenerateContentConfig(
                     safety_settings=self.model_safety_settings,
                     temperature=self.temperature,
                     **self.generation_kwargs,
@@ -194,11 +257,15 @@ class GeminiModel(DeepEvalBaseLLM):
         """
         client = self.load_model()
 
+        if check_if_multimodal(prompt):
+            prompt = convert_to_multi_modal_array(prompt)
+            prompt = self.generate_prompt(prompt)
+
         if schema is not None:
             response = await client.aio.models.generate_content(
-                model=self.
+                model=self.name,
                 contents=prompt,
-                config=types.GenerateContentConfig(
+                config=self._module.types.GenerateContentConfig(
                     response_mime_type="application/json",
                     response_schema=schema,
                     safety_settings=self.model_safety_settings,
@@ -209,9 +276,9 @@ class GeminiModel(DeepEvalBaseLLM):
             return response.parsed, 0
         else:
             response = await client.aio.models.generate_content(
-                model=self.
+                model=self.name,
                 contents=prompt,
-                config=types.GenerateContentConfig(
+                config=self._module.types.GenerateContentConfig(
                     safety_settings=self.model_safety_settings,
                     temperature=self.temperature,
                     **self.generation_kwargs,
@@ -223,11 +290,7 @@ class GeminiModel(DeepEvalBaseLLM):
     # Model #
     #########
 
-    def
-        """Returns the name of the Gemini model being used."""
-        return self.model_name
-
-    def load_model(self, *args, **kwargs):
+    def load_model(self):
         """Creates a client.
         With Gen AI SDK, model is set at inference time, so there is no
         model to load and initialize.
@@ -236,7 +299,21 @@ class GeminiModel(DeepEvalBaseLLM):
         Returns:
             A GenerativeModel instance configured for evaluation.
         """
-        return self._build_client(
+        return self._build_client()
+
+    def _require_oauth2(self):
+        return require_dependency(
+            "google.oauth2",
+            provider_label="GeminiModel",
+            install_hint="Install it with `pip install google-auth`.",
+        )
+
+    def _require_module(self):
+        return require_dependency(
+            "google.genai",
+            provider_label="GeminiModel",
+            install_hint="Install it with `pip install google-genai`.",
+        )
 
     def _client_kwargs(self, **override_kwargs) -> Dict:
         """Merge ctor kwargs with any overrides passed at load_model time."""
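`_require_oauth2` and `_require_module` both lean on the new `require_dependency` helper from `deepeval.utils` (part of the +65 lines there), whose implementation is not shown in this diff. A plausible sketch of what such a helper does, importing the module by name and failing with the provider-specific hint:

```python
import importlib


def require_dependency(module_name: str, provider_label: str, install_hint: str):
    """Hypothetical stand-in for deepeval.utils.require_dependency."""
    try:
        # Resolve the optional dependency only when a provider needs it.
        return importlib.import_module(module_name)
    except ImportError as exc:
        raise ImportError(
            f"{provider_label} requires the `{module_name}` package. {install_hint}"
        ) from exc
```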
@@ -245,8 +322,8 @@ class GeminiModel(DeepEvalBaseLLM):
         client_kwargs.update(override_kwargs)
         return client_kwargs
 
-    def _build_client(self
-        client_kwargs = self._client_kwargs(**
+    def _build_client(self) -> "Client":
+        client_kwargs = self._client_kwargs(**self.kwargs)
 
         if self.should_use_vertexai():
             if not self.project or not self.location:
@@ -256,8 +333,9 @@ class GeminiModel(DeepEvalBaseLLM):
                     "GOOGLE_CLOUD_LOCATION in your DeepEval configuration."
                 )
 
+            oauth2 = self._require_oauth2()
             credentials = (
-                service_account.Credentials.from_service_account_info(
+                oauth2.service_account.Credentials.from_service_account_info(
                     self.service_account_key,
                     scopes=[
                         "https://www.googleapis.com/auth/cloud-platform",
@@ -267,7 +345,7 @@ class GeminiModel(DeepEvalBaseLLM):
                 else None
             )
 
-            client = Client(
+            client = self._module.Client(
                 vertexai=True,
                 project=self.project,
                 location=self.location,
@@ -282,6 +360,14 @@ class GeminiModel(DeepEvalBaseLLM):
                 param_hint="`api_key` to GeminiModel(...)",
             )
 
-            client = Client(api_key=api_key, **client_kwargs)
+            client = self._module.Client(api_key=api_key, **client_kwargs)
 
         return client
+
+    def supports_multimodal(self):
+        if self.name in valid_multimodal_models:
+            return True
+        return False
+
+    def get_model_name(self):
+        return f"{self.name} (Gemini)"
deepeval/models/llms/grok_model.py

@@ -7,11 +7,12 @@ from deepeval.models.retry_policy import (
     sdk_retries_for,
 )
 from deepeval.models.llms.utils import trim_and_load_json
-from deepeval.models.utils import
+from deepeval.models.utils import (
+    require_secret_api_key,
+)
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.constants import ProviderSlug as PS
 
-
 # consistent retry rules
 retry_grok = create_retry_decorator(PS.GROK)
 
@@ -61,11 +62,12 @@ class GrokModel(DeepEvalBaseLLM):
         generation_kwargs: Optional[Dict] = None,
         **kwargs,
     ):
+
         settings = get_settings()
 
-
+        model = model or settings.GROK_MODEL_NAME
 
-        if
+        if model not in model_pricing:
             raise ValueError(
                 f"Invalid model. Available Grok models: {', '.join(model_pricing.keys())}"
             )
@@ -83,9 +85,10 @@ class GrokModel(DeepEvalBaseLLM):
         else:
             self.api_key = settings.GROK_API_KEY
 
+        # Keep sanitized kwargs for client call to strip legacy keys
         self.kwargs = kwargs
         self.generation_kwargs = generation_kwargs or {}
-        super().__init__(
+        super().__init__(model)
 
     ###############################################
     # Other generate functions
@@ -95,6 +98,7 @@ class GrokModel(DeepEvalBaseLLM):
     def generate(
         self, prompt: str, schema: Optional[BaseModel] = None
     ) -> Tuple[Union[str, Dict], float]:
+
         try:
             from xai_sdk.chat import user
         except ImportError:
@@ -103,13 +107,13 @@ class GrokModel(DeepEvalBaseLLM):
             )
         client = self.load_model(async_mode=False)
         chat = client.chat.create(
-            model=self.
+            model=self.name,
             temperature=self.temperature,
             **self.generation_kwargs,
         )
         chat.append(user(prompt))
 
-        if schema and self.
+        if schema and self.name in structured_outputs_models:
             response, structured_output = chat.parse(schema)
             cost = self.calculate_cost(
                 response.usage.prompt_tokens,
@@ -133,6 +137,7 @@ class GrokModel(DeepEvalBaseLLM):
     async def a_generate(
         self, prompt: str, schema: Optional[BaseModel] = None
     ) -> Tuple[Union[str, Dict], float]:
+
         try:
             from xai_sdk.chat import user
         except ImportError:
@@ -141,13 +146,13 @@ class GrokModel(DeepEvalBaseLLM):
             )
         client = self.load_model(async_mode=True)
         chat = client.chat.create(
-            model=self.
+            model=self.name,
             temperature=self.temperature,
             **self.generation_kwargs,
         )
         chat.append(user(prompt))
 
-        if schema and self.
+        if schema and self.name in structured_outputs_models:
             response, structured_output = await chat.parse(schema)
             cost = self.calculate_cost(
                 response.usage.prompt_tokens,
@@ -176,7 +181,7 @@ class GrokModel(DeepEvalBaseLLM):
         input_tokens: int,
         output_tokens: int,
     ) -> float:
-        pricing = model_pricing.get(self.
+        pricing = model_pricing.get(self.name, model_pricing)
         input_cost = input_tokens * pricing["input"]
         output_cost = output_tokens * pricing["output"]
         return input_cost + output_cost
@@ -198,9 +203,6 @@ class GrokModel(DeepEvalBaseLLM):
             "xai_sdk is required to use GrokModel. Please install it with: pip install xai-sdk"
         )
 
-    def get_model_name(self):
-        return f"{self.model_name}"
-
     def _client_kwargs(self) -> Dict:
         """
         If Tenacity is managing retries, disable gRPC channel retries to avoid double retry.
@@ -242,3 +244,6 @@ class GrokModel(DeepEvalBaseLLM):
             kw.pop("channel_options", None)
             return cls(**kw)
         raise
+
+    def get_model_name(self):
+        return f"{self.name} (Grok)"
deepeval/models/llms/kimi_model.py

@@ -8,7 +8,9 @@ from deepeval.models.retry_policy import (
     sdk_retries_for,
 )
 from deepeval.models.llms.utils import trim_and_load_json
-from deepeval.models.utils import
+from deepeval.models.utils import (
+    require_secret_api_key,
+)
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.constants import ProviderSlug as PS
 
@@ -74,16 +76,16 @@ model_pricing = {
 class KimiModel(DeepEvalBaseLLM):
     def __init__(
         self,
-        api_key: Optional[str] = None,
         model: Optional[str] = None,
+        api_key: Optional[str] = None,
         temperature: float = 0,
         generation_kwargs: Optional[Dict] = None,
         **kwargs,
     ):
         settings = get_settings()
 
-
-        if
+        model = model or settings.MOONSHOT_MODEL_NAME
+        if model not in model_pricing:
             raise ValueError(
                 f"Invalid model. Available Moonshot models: {', '.join(model_pricing.keys())}"
             )
@@ -103,9 +105,10 @@ class KimiModel(DeepEvalBaseLLM):
             self.api_key = settings.MOONSHOT_API_KEY
 
         self.base_url = "https://api.moonshot.cn/v1"
+        # Keep sanitized kwargs for client call to strip legacy keys
        self.kwargs = kwargs
         self.generation_kwargs = generation_kwargs or {}
-        super().__init__(
+        super().__init__(model)
 
     ###############################################
     # Other generate functions
@@ -115,10 +118,11 @@ class KimiModel(DeepEvalBaseLLM):
     def generate(
         self, prompt: str, schema: Optional[BaseModel] = None
     ) -> Tuple[Union[str, Dict], float]:
+
         client = self.load_model(async_mode=False)
-        if schema and self.
+        if schema and self.name in json_mode_models:
             completion = client.chat.completions.create(
-                model=self.
+                model=self.name,
                 messages=[{"role": "user", "content": prompt}],
                 response_format={"type": "json_object"},
                 temperature=self.temperature,
@@ -134,7 +138,7 @@ class KimiModel(DeepEvalBaseLLM):
             return schema.model_validate(json_output), cost
 
         completion = client.chat.completions.create(
-            model=self.
+            model=self.name,
             messages=[{"role": "user", "content": prompt}],
             **self.generation_kwargs,
         )
@@ -153,10 +157,11 @@ class KimiModel(DeepEvalBaseLLM):
     async def a_generate(
         self, prompt: str, schema: Optional[BaseModel] = None
     ) -> Tuple[Union[str, Dict], float]:
+
         client = self.load_model(async_mode=True)
-        if schema and self.
+        if schema and self.name in json_mode_models:
             completion = await client.chat.completions.create(
-                model=self.
+                model=self.name,
                 messages=[{"role": "user", "content": prompt}],
                 response_format={"type": "json_object"},
                 temperature=self.temperature,
@@ -172,7 +177,7 @@ class KimiModel(DeepEvalBaseLLM):
             return schema.model_validate(json_output), cost
 
         completion = await client.chat.completions.create(
-            model=self.
+            model=self.name,
             messages=[{"role": "user", "content": prompt}],
             **self.generation_kwargs,
         )
@@ -196,7 +201,7 @@ class KimiModel(DeepEvalBaseLLM):
         input_tokens: int,
         output_tokens: int,
     ) -> float:
-        pricing = model_pricing.get(self.
+        pricing = model_pricing.get(self.name, model_pricing)
         input_cost = input_tokens * pricing["input"]
         output_cost = output_tokens * pricing["output"]
         return input_cost + output_cost
@@ -244,4 +249,4 @@ class KimiModel(DeepEvalBaseLLM):
         raise
 
     def get_model_name(self):
-        return f"{self.
+        return f"{self.name} (KIMI)"
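`KimiModel`'s schema path relies on Moonshot's JSON mode: when the chosen model is in `json_mode_models`, the completion is requested with `response_format={"type": "json_object"}` and the reply is validated into the caller's pydantic schema via `schema.model_validate`. A hedged usage sketch (the model name and key are placeholders, and the `(result, cost)` return shape follows the diff above):

```python
from pydantic import BaseModel

from deepeval.models.llms.kimi_model import KimiModel


class Verdict(BaseModel):
    verdict: str
    reason: str


model = KimiModel(model="moonshot-v1-8k", api_key="your-moonshot-key")

# With a schema and a JSON-mode-capable model, generate() returns the
# validated pydantic object plus the computed token cost.
result, cost = model.generate(
    "Return a JSON object with keys `verdict` and `reason` about: 2+2=4",
    schema=Verdict,
)
```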