deepeval 3.7.3__py3-none-any.whl → 3.7.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/cli/test.py +1 -1
- deepeval/config/settings.py +102 -13
- deepeval/dataset/golden.py +54 -2
- deepeval/evaluate/configs.py +1 -1
- deepeval/evaluate/evaluate.py +16 -8
- deepeval/evaluate/execute.py +74 -27
- deepeval/evaluate/utils.py +26 -22
- deepeval/integrations/pydantic_ai/agent.py +19 -2
- deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
- deepeval/metrics/__init__.py +14 -12
- deepeval/metrics/answer_relevancy/answer_relevancy.py +74 -29
- deepeval/metrics/answer_relevancy/template.py +188 -92
- deepeval/metrics/argument_correctness/template.py +2 -2
- deepeval/metrics/base_metric.py +2 -5
- deepeval/metrics/bias/template.py +3 -3
- deepeval/metrics/contextual_precision/contextual_precision.py +53 -15
- deepeval/metrics/contextual_precision/template.py +115 -66
- deepeval/metrics/contextual_recall/contextual_recall.py +50 -13
- deepeval/metrics/contextual_recall/template.py +106 -55
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +47 -15
- deepeval/metrics/contextual_relevancy/template.py +87 -58
- deepeval/metrics/conversation_completeness/template.py +2 -2
- deepeval/metrics/conversational_dag/templates.py +4 -4
- deepeval/metrics/conversational_g_eval/template.py +4 -3
- deepeval/metrics/dag/templates.py +5 -5
- deepeval/metrics/faithfulness/faithfulness.py +70 -27
- deepeval/metrics/faithfulness/schema.py +1 -1
- deepeval/metrics/faithfulness/template.py +200 -115
- deepeval/metrics/g_eval/utils.py +2 -2
- deepeval/metrics/hallucination/template.py +4 -4
- deepeval/metrics/indicator.py +4 -4
- deepeval/metrics/misuse/template.py +2 -2
- deepeval/metrics/multimodal_metrics/__init__.py +0 -18
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +24 -17
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +26 -21
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +24 -17
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +24 -17
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +19 -19
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +63 -78
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +20 -20
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +71 -50
- deepeval/metrics/non_advice/template.py +2 -2
- deepeval/metrics/pii_leakage/template.py +2 -2
- deepeval/metrics/prompt_alignment/template.py +4 -4
- deepeval/metrics/ragas.py +3 -3
- deepeval/metrics/role_violation/template.py +2 -2
- deepeval/metrics/step_efficiency/step_efficiency.py +1 -1
- deepeval/metrics/tool_correctness/tool_correctness.py +2 -2
- deepeval/metrics/toxicity/template.py +4 -4
- deepeval/metrics/turn_contextual_precision/schema.py +21 -0
- deepeval/metrics/turn_contextual_precision/template.py +187 -0
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +550 -0
- deepeval/metrics/turn_contextual_recall/schema.py +21 -0
- deepeval/metrics/turn_contextual_recall/template.py +178 -0
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +520 -0
- deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
- deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +535 -0
- deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
- deepeval/metrics/turn_faithfulness/template.py +218 -0
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +596 -0
- deepeval/metrics/turn_relevancy/template.py +2 -2
- deepeval/metrics/utils.py +39 -58
- deepeval/models/__init__.py +0 -12
- deepeval/models/base_model.py +16 -38
- deepeval/models/embedding_models/__init__.py +7 -0
- deepeval/models/embedding_models/azure_embedding_model.py +69 -32
- deepeval/models/embedding_models/local_embedding_model.py +39 -22
- deepeval/models/embedding_models/ollama_embedding_model.py +42 -18
- deepeval/models/embedding_models/openai_embedding_model.py +50 -15
- deepeval/models/llms/amazon_bedrock_model.py +1 -2
- deepeval/models/llms/anthropic_model.py +53 -20
- deepeval/models/llms/azure_model.py +140 -43
- deepeval/models/llms/deepseek_model.py +38 -23
- deepeval/models/llms/gemini_model.py +222 -103
- deepeval/models/llms/grok_model.py +39 -27
- deepeval/models/llms/kimi_model.py +39 -23
- deepeval/models/llms/litellm_model.py +103 -45
- deepeval/models/llms/local_model.py +35 -22
- deepeval/models/llms/ollama_model.py +129 -17
- deepeval/models/llms/openai_model.py +151 -50
- deepeval/models/llms/portkey_model.py +149 -0
- deepeval/models/llms/utils.py +5 -3
- deepeval/models/retry_policy.py +17 -14
- deepeval/models/utils.py +94 -4
- deepeval/optimizer/__init__.py +5 -0
- deepeval/optimizer/algorithms/__init__.py +6 -0
- deepeval/optimizer/algorithms/base.py +29 -0
- deepeval/optimizer/algorithms/configs.py +18 -0
- deepeval/optimizer/algorithms/copro/__init__.py +5 -0
- deepeval/optimizer/algorithms/copro/copro.py +836 -0
- deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
- deepeval/optimizer/algorithms/gepa/gepa.py +737 -0
- deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
- deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
- deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
- deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
- deepeval/optimizer/algorithms/simba/__init__.py +5 -0
- deepeval/optimizer/algorithms/simba/simba.py +999 -0
- deepeval/optimizer/algorithms/simba/types.py +15 -0
- deepeval/optimizer/configs.py +31 -0
- deepeval/optimizer/policies.py +227 -0
- deepeval/optimizer/prompt_optimizer.py +263 -0
- deepeval/optimizer/rewriter/__init__.py +5 -0
- deepeval/optimizer/rewriter/rewriter.py +124 -0
- deepeval/optimizer/rewriter/utils.py +214 -0
- deepeval/optimizer/scorer/__init__.py +5 -0
- deepeval/optimizer/scorer/base.py +86 -0
- deepeval/optimizer/scorer/scorer.py +316 -0
- deepeval/optimizer/scorer/utils.py +30 -0
- deepeval/optimizer/types.py +148 -0
- deepeval/optimizer/utils.py +480 -0
- deepeval/prompt/prompt.py +7 -6
- deepeval/test_case/__init__.py +1 -3
- deepeval/test_case/api.py +12 -10
- deepeval/test_case/conversational_test_case.py +19 -1
- deepeval/test_case/llm_test_case.py +152 -1
- deepeval/test_case/utils.py +4 -8
- deepeval/test_run/api.py +15 -14
- deepeval/test_run/cache.py +2 -0
- deepeval/test_run/test_run.py +9 -4
- deepeval/tracing/patchers.py +9 -4
- deepeval/tracing/tracing.py +2 -2
- deepeval/utils.py +89 -0
- {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/METADATA +1 -4
- {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/RECORD +134 -118
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
- deepeval/models/mlllms/__init__.py +0 -4
- deepeval/models/mlllms/azure_model.py +0 -334
- deepeval/models/mlllms/gemini_model.py +0 -284
- deepeval/models/mlllms/ollama_model.py +0 -144
- deepeval/models/mlllms/openai_model.py +0 -258
- deepeval/test_case/mllm_test_case.py +0 -170
- /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
- {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/WHEEL +0 -0
- {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/entry_points.txt +0 -0
|
@@ -1,15 +1,32 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
from
|
|
1
|
+
import json
|
|
2
|
+
import requests
|
|
3
|
+
from pydantic import BaseModel, SecretStr
|
|
4
|
+
from typing import TYPE_CHECKING, Optional, Dict, List, Union
|
|
4
5
|
|
|
6
|
+
from deepeval.test_case import MLLMImage
|
|
7
|
+
from deepeval.config.settings import get_settings
|
|
8
|
+
from deepeval.models.utils import require_secret_api_key
|
|
5
9
|
from deepeval.models.retry_policy import (
|
|
6
10
|
create_retry_decorator,
|
|
7
11
|
)
|
|
8
|
-
from deepeval.
|
|
12
|
+
from deepeval.utils import (
|
|
13
|
+
convert_to_multi_modal_array,
|
|
14
|
+
check_if_multimodal,
|
|
15
|
+
require_dependency,
|
|
16
|
+
)
|
|
9
17
|
from deepeval.models.base_model import DeepEvalBaseLLM
|
|
10
18
|
from deepeval.constants import ProviderSlug as PS
|
|
11
|
-
|
|
12
|
-
|
|
19
|
+
|
|
20
|
+
valid_multimodal_models = [
|
|
21
|
+
"gemini-2.5-pro",
|
|
22
|
+
"gemini-2.5-flash",
|
|
23
|
+
"gemini-1.5-pro",
|
|
24
|
+
"gemini-1.5-flash",
|
|
25
|
+
# TODO: Add more models later
|
|
26
|
+
]
|
|
27
|
+
|
|
28
|
+
if TYPE_CHECKING:
|
|
29
|
+
from google.genai import Client
|
|
13
30
|
|
|
14
31
|
default_gemini_model = "gemini-1.5-pro"
|
|
15
32
|
|
|
@@ -26,7 +43,7 @@ class GeminiModel(DeepEvalBaseLLM):
|
|
|
26
43
|
To use Vertex AI API, set project and location attributes.
|
|
27
44
|
|
|
28
45
|
Attributes:
|
|
29
|
-
|
|
46
|
+
model: Name of the Gemini model to use
|
|
30
47
|
api_key: Google API key for authentication
|
|
31
48
|
project: Google Cloud project ID
|
|
32
49
|
location: Google Cloud location
|
|
@@ -37,7 +54,7 @@ class GeminiModel(DeepEvalBaseLLM):
|
|
|
37
54
|
|
|
38
55
|
# Initialize the model
|
|
39
56
|
model = GeminiModel(
|
|
40
|
-
|
|
57
|
+
model="gemini-1.5-pro-001",
|
|
41
58
|
api_key="your-api-key"
|
|
42
59
|
)
|
|
43
60
|
|
|
@@ -48,40 +65,39 @@ class GeminiModel(DeepEvalBaseLLM):
|
|
|
48
65
|
|
|
49
66
|
def __init__(
|
|
50
67
|
self,
|
|
51
|
-
|
|
68
|
+
model: Optional[str] = None,
|
|
52
69
|
api_key: Optional[str] = None,
|
|
70
|
+
temperature: float = 0,
|
|
53
71
|
project: Optional[str] = None,
|
|
54
72
|
location: Optional[str] = None,
|
|
55
73
|
service_account_key: Optional[Dict[str, str]] = None,
|
|
56
|
-
temperature: float = 0,
|
|
57
74
|
generation_kwargs: Optional[Dict] = None,
|
|
58
75
|
**kwargs,
|
|
59
76
|
):
|
|
60
|
-
model_name = (
|
|
61
|
-
model_name
|
|
62
|
-
or KEY_FILE_HANDLER.fetch_data(ModelKeyValues.GEMINI_MODEL_NAME)
|
|
63
|
-
or default_gemini_model
|
|
64
|
-
)
|
|
65
77
|
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
+
settings = get_settings()
|
|
79
|
+
|
|
80
|
+
model = model or settings.GEMINI_MODEL_NAME or default_gemini_model
|
|
81
|
+
|
|
82
|
+
# Get API key from settings if not provided
|
|
83
|
+
if api_key is not None:
|
|
84
|
+
# keep it secret, keep it safe from serialization, logging and the like
|
|
85
|
+
self.api_key: SecretStr | None = SecretStr(api_key)
|
|
86
|
+
else:
|
|
87
|
+
self.api_key = settings.GOOGLE_API_KEY
|
|
88
|
+
|
|
89
|
+
self.project = project or settings.GOOGLE_CLOUD_PROJECT
|
|
90
|
+
self.location = (
|
|
91
|
+
location
|
|
92
|
+
or settings.GOOGLE_CLOUD_LOCATION is not None
|
|
93
|
+
and str(settings.GOOGLE_CLOUD_LOCATION)
|
|
78
94
|
)
|
|
95
|
+
self.use_vertexai = settings.GOOGLE_GENAI_USE_VERTEXAI
|
|
96
|
+
|
|
79
97
|
if service_account_key:
|
|
80
98
|
self.service_account_key = service_account_key
|
|
81
99
|
else:
|
|
82
|
-
service_account_key_data =
|
|
83
|
-
ModelKeyValues.GOOGLE_SERVICE_ACCOUNT_KEY
|
|
84
|
-
)
|
|
100
|
+
service_account_key_data = settings.GOOGLE_SERVICE_ACCOUNT_KEY
|
|
85
101
|
if service_account_key_data is None:
|
|
86
102
|
self.service_account_key = None
|
|
87
103
|
elif isinstance(service_account_key_data, str):
|
|
@@ -90,11 +106,35 @@ class GeminiModel(DeepEvalBaseLLM):
|
|
|
90
106
|
if temperature < 0:
|
|
91
107
|
raise ValueError("Temperature must be >= 0.")
|
|
92
108
|
self.temperature = temperature
|
|
109
|
+
|
|
110
|
+
# Raw kwargs destined for the underlying Client
|
|
93
111
|
self.kwargs = kwargs
|
|
94
112
|
self.generation_kwargs = generation_kwargs or {}
|
|
95
|
-
super().__init__(model_name, **kwargs)
|
|
96
113
|
|
|
97
|
-
|
|
114
|
+
self._module = self._require_module()
|
|
115
|
+
# Configure default model generation settings
|
|
116
|
+
self.model_safety_settings = [
|
|
117
|
+
self._module.types.SafetySetting(
|
|
118
|
+
category=self._module.types.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
|
|
119
|
+
threshold=self._module.types.HarmBlockThreshold.BLOCK_NONE,
|
|
120
|
+
),
|
|
121
|
+
self._module.types.SafetySetting(
|
|
122
|
+
category=self._module.types.HarmCategory.HARM_CATEGORY_HARASSMENT,
|
|
123
|
+
threshold=self._module.types.HarmBlockThreshold.BLOCK_NONE,
|
|
124
|
+
),
|
|
125
|
+
self._module.types.SafetySetting(
|
|
126
|
+
category=self._module.types.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
|
|
127
|
+
threshold=self._module.types.HarmBlockThreshold.BLOCK_NONE,
|
|
128
|
+
),
|
|
129
|
+
self._module.types.SafetySetting(
|
|
130
|
+
category=self._module.types.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
|
|
131
|
+
threshold=self._module.types.HarmBlockThreshold.BLOCK_NONE,
|
|
132
|
+
),
|
|
133
|
+
]
|
|
134
|
+
|
|
135
|
+
super().__init__(model)
|
|
136
|
+
|
|
137
|
+
def should_use_vertexai(self) -> bool:
|
|
98
138
|
"""Checks if the model should use Vertex AI for generation.
|
|
99
139
|
|
|
100
140
|
This is determined first by the value of `GOOGLE_GENAI_USE_VERTEXAI`
|
|
@@ -111,69 +151,53 @@ class GeminiModel(DeepEvalBaseLLM):
|
|
|
111
151
|
else:
|
|
112
152
|
return False
|
|
113
153
|
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
154
|
+
@retry_gemini
|
|
155
|
+
def generate_prompt(
|
|
156
|
+
self, multimodal_input: List[Union[str, MLLMImage]] = []
|
|
157
|
+
) -> List[Union[str, MLLMImage]]:
|
|
158
|
+
"""Converts DeepEval multimodal input into GenAI SDK compatible format.
|
|
159
|
+
|
|
160
|
+
Args:
|
|
161
|
+
multimodal_input: List of strings and MLLMImage objects
|
|
119
162
|
|
|
120
163
|
Returns:
|
|
121
|
-
|
|
164
|
+
List of strings and GenAI SDK image Part objects ready for model input
|
|
165
|
+
|
|
166
|
+
Raises:
|
|
167
|
+
ValueError: If an invalid input type is provided
|
|
122
168
|
"""
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
raise ValueError(
|
|
126
|
-
"When using Vertex AI API, both project and location are required."
|
|
127
|
-
"Either provide them as arguments or set GOOGLE_CLOUD_PROJECT and GOOGLE_CLOUD_LOCATION environment variables, "
|
|
128
|
-
"or set them in your DeepEval configuration."
|
|
129
|
-
)
|
|
169
|
+
prompt = []
|
|
170
|
+
settings = get_settings()
|
|
130
171
|
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
172
|
+
for ele in multimodal_input:
|
|
173
|
+
if isinstance(ele, str):
|
|
174
|
+
prompt.append(ele)
|
|
175
|
+
elif isinstance(ele, MLLMImage):
|
|
176
|
+
if ele.local:
|
|
177
|
+
with open(ele.url, "rb") as f:
|
|
178
|
+
image_data = f.read()
|
|
179
|
+
else:
|
|
180
|
+
response = requests.get(
|
|
181
|
+
ele.url,
|
|
182
|
+
timeout=(
|
|
183
|
+
settings.MEDIA_IMAGE_CONNECT_TIMEOUT_SECONDS,
|
|
184
|
+
settings.MEDIA_IMAGE_READ_TIMEOUT_SECONDS,
|
|
185
|
+
),
|
|
142
186
|
)
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
else:
|
|
149
|
-
if not self.api_key:
|
|
150
|
-
raise ValueError(
|
|
151
|
-
"Google API key is required. Either provide it directly, set GOOGLE_API_KEY environment variable, "
|
|
152
|
-
"or set it in your DeepEval configuration."
|
|
187
|
+
response.raise_for_status()
|
|
188
|
+
image_data = response.content
|
|
189
|
+
|
|
190
|
+
image_part = self._module.types.Part.from_bytes(
|
|
191
|
+
data=image_data, mime_type="image/jpeg"
|
|
153
192
|
)
|
|
154
|
-
|
|
155
|
-
|
|
193
|
+
prompt.append(image_part)
|
|
194
|
+
else:
|
|
195
|
+
raise ValueError(f"Invalid input type: {type(ele)}")
|
|
196
|
+
return prompt
|
|
156
197
|
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
category=types.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
|
|
161
|
-
threshold=types.HarmBlockThreshold.BLOCK_NONE,
|
|
162
|
-
),
|
|
163
|
-
types.SafetySetting(
|
|
164
|
-
category=types.HarmCategory.HARM_CATEGORY_HARASSMENT,
|
|
165
|
-
threshold=types.HarmBlockThreshold.BLOCK_NONE,
|
|
166
|
-
),
|
|
167
|
-
types.SafetySetting(
|
|
168
|
-
category=types.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
|
|
169
|
-
threshold=types.HarmBlockThreshold.BLOCK_NONE,
|
|
170
|
-
),
|
|
171
|
-
types.SafetySetting(
|
|
172
|
-
category=types.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
|
|
173
|
-
threshold=types.HarmBlockThreshold.BLOCK_NONE,
|
|
174
|
-
),
|
|
175
|
-
]
|
|
176
|
-
return self.client.models
|
|
198
|
+
###############################################
|
|
199
|
+
# Generate functions
|
|
200
|
+
###############################################
|
|
177
201
|
|
|
178
202
|
@retry_gemini
|
|
179
203
|
def generate(self, prompt: str, schema: Optional[BaseModel] = None) -> str:
|
|
@@ -186,11 +210,18 @@ class GeminiModel(DeepEvalBaseLLM):
|
|
|
186
210
|
Returns:
|
|
187
211
|
Generated text response or structured output as Pydantic model
|
|
188
212
|
"""
|
|
213
|
+
client = self.load_model()
|
|
214
|
+
|
|
215
|
+
if check_if_multimodal(prompt):
|
|
216
|
+
|
|
217
|
+
prompt = convert_to_multi_modal_array(prompt)
|
|
218
|
+
prompt = self.generate_prompt(prompt)
|
|
219
|
+
|
|
189
220
|
if schema is not None:
|
|
190
|
-
response =
|
|
191
|
-
model=self.
|
|
221
|
+
response = client.models.generate_content(
|
|
222
|
+
model=self.name,
|
|
192
223
|
contents=prompt,
|
|
193
|
-
config=types.GenerateContentConfig(
|
|
224
|
+
config=self._module.types.GenerateContentConfig(
|
|
194
225
|
response_mime_type="application/json",
|
|
195
226
|
response_schema=schema,
|
|
196
227
|
safety_settings=self.model_safety_settings,
|
|
@@ -200,10 +231,10 @@ class GeminiModel(DeepEvalBaseLLM):
|
|
|
200
231
|
)
|
|
201
232
|
return response.parsed, 0
|
|
202
233
|
else:
|
|
203
|
-
response =
|
|
204
|
-
model=self.
|
|
234
|
+
response = client.models.generate_content(
|
|
235
|
+
model=self.name,
|
|
205
236
|
contents=prompt,
|
|
206
|
-
config=types.GenerateContentConfig(
|
|
237
|
+
config=self._module.types.GenerateContentConfig(
|
|
207
238
|
safety_settings=self.model_safety_settings,
|
|
208
239
|
temperature=self.temperature,
|
|
209
240
|
**self.generation_kwargs,
|
|
@@ -224,11 +255,17 @@ class GeminiModel(DeepEvalBaseLLM):
|
|
|
224
255
|
Returns:
|
|
225
256
|
Generated text response or structured output as Pydantic model
|
|
226
257
|
"""
|
|
258
|
+
client = self.load_model()
|
|
259
|
+
|
|
260
|
+
if check_if_multimodal(prompt):
|
|
261
|
+
prompt = convert_to_multi_modal_array(prompt)
|
|
262
|
+
prompt = self.generate_prompt(prompt)
|
|
263
|
+
|
|
227
264
|
if schema is not None:
|
|
228
|
-
response = await
|
|
229
|
-
model=self.
|
|
265
|
+
response = await client.aio.models.generate_content(
|
|
266
|
+
model=self.name,
|
|
230
267
|
contents=prompt,
|
|
231
|
-
config=types.GenerateContentConfig(
|
|
268
|
+
config=self._module.types.GenerateContentConfig(
|
|
232
269
|
response_mime_type="application/json",
|
|
233
270
|
response_schema=schema,
|
|
234
271
|
safety_settings=self.model_safety_settings,
|
|
@@ -238,10 +275,10 @@ class GeminiModel(DeepEvalBaseLLM):
|
|
|
238
275
|
)
|
|
239
276
|
return response.parsed, 0
|
|
240
277
|
else:
|
|
241
|
-
response = await
|
|
242
|
-
model=self.
|
|
278
|
+
response = await client.aio.models.generate_content(
|
|
279
|
+
model=self.name,
|
|
243
280
|
contents=prompt,
|
|
244
|
-
config=types.GenerateContentConfig(
|
|
281
|
+
config=self._module.types.GenerateContentConfig(
|
|
245
282
|
safety_settings=self.model_safety_settings,
|
|
246
283
|
temperature=self.temperature,
|
|
247
284
|
**self.generation_kwargs,
|
|
@@ -249,6 +286,88 @@ class GeminiModel(DeepEvalBaseLLM):
|
|
|
249
286
|
)
|
|
250
287
|
return response.text, 0
|
|
251
288
|
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
289
|
+
#########
|
|
290
|
+
# Model #
|
|
291
|
+
#########
|
|
292
|
+
|
|
293
|
+
def load_model(self):
|
|
294
|
+
"""Creates a client.
|
|
295
|
+
With Gen AI SDK, model is set at inference time, so there is no
|
|
296
|
+
model to load and initialize.
|
|
297
|
+
This method name is kept for compatibility with other LLMs.
|
|
298
|
+
|
|
299
|
+
Returns:
|
|
300
|
+
A Gen AI SDK Client instance configured for evaluation.
|
|
301
|
+
"""
|
|
302
|
+
return self._build_client()
|
|
303
|
+
|
|
304
|
+
def _require_oauth2(self):
|
|
305
|
+
return require_dependency(
|
|
306
|
+
"google.oauth2",
|
|
307
|
+
provider_label="GeminiModel",
|
|
308
|
+
install_hint="Install it with `pip install google-auth`.",
|
|
309
|
+
)
|
|
310
|
+
|
|
311
|
+
def _require_module(self):
|
|
312
|
+
return require_dependency(
|
|
313
|
+
"google.genai",
|
|
314
|
+
provider_label="GeminiModel",
|
|
315
|
+
install_hint="Install it with `pip install google-genai`.",
|
|
316
|
+
)
|
|
317
|
+
|
|
318
|
+
def _client_kwargs(self, **override_kwargs) -> Dict:
|
|
319
|
+
"""Merge ctor kwargs with any overrides passed at load_model time."""
|
|
320
|
+
client_kwargs = dict(self.kwargs or {})
|
|
321
|
+
if override_kwargs:
|
|
322
|
+
client_kwargs.update(override_kwargs)
|
|
323
|
+
return client_kwargs
|
|
324
|
+
|
|
325
|
+
def _build_client(self) -> "Client":
|
|
326
|
+
client_kwargs = self._client_kwargs(**self.kwargs)
|
|
327
|
+
|
|
328
|
+
if self.should_use_vertexai():
|
|
329
|
+
if not self.project or not self.location:
|
|
330
|
+
raise ValueError(
|
|
331
|
+
"When using Vertex AI API, both project and location are required. "
|
|
332
|
+
"Either provide them as arguments or set GOOGLE_CLOUD_PROJECT and "
|
|
333
|
+
"GOOGLE_CLOUD_LOCATION in your DeepEval configuration."
|
|
334
|
+
)
|
|
335
|
+
|
|
336
|
+
oauth2 = self._require_oauth2()
|
|
337
|
+
credentials = (
|
|
338
|
+
oauth2.service_account.Credentials.from_service_account_info(
|
|
339
|
+
self.service_account_key,
|
|
340
|
+
scopes=[
|
|
341
|
+
"https://www.googleapis.com/auth/cloud-platform",
|
|
342
|
+
],
|
|
343
|
+
)
|
|
344
|
+
if self.service_account_key
|
|
345
|
+
else None
|
|
346
|
+
)
|
|
347
|
+
|
|
348
|
+
client = self._module.Client(
|
|
349
|
+
vertexai=True,
|
|
350
|
+
project=self.project,
|
|
351
|
+
location=self.location,
|
|
352
|
+
credentials=credentials,
|
|
353
|
+
**client_kwargs,
|
|
354
|
+
)
|
|
355
|
+
else:
|
|
356
|
+
api_key = require_secret_api_key(
|
|
357
|
+
self.api_key,
|
|
358
|
+
provider_label="Google Gemini",
|
|
359
|
+
env_var_name="GOOGLE_API_KEY",
|
|
360
|
+
param_hint="`api_key` to GeminiModel(...)",
|
|
361
|
+
)
|
|
362
|
+
|
|
363
|
+
client = self._module.Client(api_key=api_key, **client_kwargs)
|
|
364
|
+
|
|
365
|
+
return client
|
|
366
|
+
|
|
367
|
+
def supports_multimodal(self):
|
|
368
|
+
if self.name in valid_multimodal_models:
|
|
369
|
+
return True
|
|
370
|
+
return False
|
|
371
|
+
|
|
372
|
+
def get_model_name(self):
|
|
373
|
+
return f"{self.name} (Gemini)"
|
|
@@ -1,18 +1,18 @@
|
|
|
1
|
-
import os
|
|
2
|
-
|
|
3
1
|
from typing import Optional, Tuple, Union, Dict
|
|
4
|
-
from pydantic import BaseModel
|
|
2
|
+
from pydantic import BaseModel, SecretStr
|
|
5
3
|
|
|
4
|
+
from deepeval.config.settings import get_settings
|
|
6
5
|
from deepeval.models.retry_policy import (
|
|
7
6
|
create_retry_decorator,
|
|
8
7
|
sdk_retries_for,
|
|
9
8
|
)
|
|
10
|
-
from deepeval.key_handler import ModelKeyValues, KEY_FILE_HANDLER
|
|
11
9
|
from deepeval.models.llms.utils import trim_and_load_json
|
|
10
|
+
from deepeval.models.utils import (
|
|
11
|
+
require_secret_api_key,
|
|
12
|
+
)
|
|
12
13
|
from deepeval.models import DeepEvalBaseLLM
|
|
13
14
|
from deepeval.constants import ProviderSlug as PS
|
|
14
15
|
|
|
15
|
-
|
|
16
16
|
# consistent retry rules
|
|
17
17
|
retry_grok = create_retry_decorator(PS.GROK)
|
|
18
18
|
|
|
@@ -62,30 +62,33 @@ class GrokModel(DeepEvalBaseLLM):
|
|
|
62
62
|
generation_kwargs: Optional[Dict] = None,
|
|
63
63
|
**kwargs,
|
|
64
64
|
):
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
65
|
+
|
|
66
|
+
settings = get_settings()
|
|
67
|
+
|
|
68
|
+
model = model or settings.GROK_MODEL_NAME
|
|
69
|
+
|
|
70
|
+
if model not in model_pricing:
|
|
69
71
|
raise ValueError(
|
|
70
72
|
f"Invalid model. Available Grok models: {', '.join(model_pricing.keys())}"
|
|
71
73
|
)
|
|
72
|
-
temperature_from_key =
|
|
73
|
-
ModelKeyValues.TEMPERATURE
|
|
74
|
-
)
|
|
74
|
+
temperature_from_key = settings.TEMPERATURE
|
|
75
75
|
if temperature_from_key is None:
|
|
76
76
|
self.temperature = temperature
|
|
77
77
|
else:
|
|
78
78
|
self.temperature = float(temperature_from_key)
|
|
79
79
|
if self.temperature < 0:
|
|
80
80
|
raise ValueError("Temperature must be >= 0.")
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
81
|
+
|
|
82
|
+
if api_key is not None:
|
|
83
|
+
# keep it secret, keep it safe from serialization, logging and the like
|
|
84
|
+
self.api_key: SecretStr | None = SecretStr(api_key)
|
|
85
|
+
else:
|
|
86
|
+
self.api_key = settings.GROK_API_KEY
|
|
87
|
+
|
|
88
|
+
# Keep sanitized kwargs for client call to strip legacy keys
|
|
86
89
|
self.kwargs = kwargs
|
|
87
90
|
self.generation_kwargs = generation_kwargs or {}
|
|
88
|
-
super().__init__(
|
|
91
|
+
super().__init__(model)
|
|
89
92
|
|
|
90
93
|
###############################################
|
|
91
94
|
# Other generate functions
|
|
@@ -95,6 +98,7 @@ class GrokModel(DeepEvalBaseLLM):
|
|
|
95
98
|
def generate(
|
|
96
99
|
self, prompt: str, schema: Optional[BaseModel] = None
|
|
97
100
|
) -> Tuple[Union[str, Dict], float]:
|
|
101
|
+
|
|
98
102
|
try:
|
|
99
103
|
from xai_sdk.chat import user
|
|
100
104
|
except ImportError:
|
|
@@ -103,13 +107,13 @@ class GrokModel(DeepEvalBaseLLM):
|
|
|
103
107
|
)
|
|
104
108
|
client = self.load_model(async_mode=False)
|
|
105
109
|
chat = client.chat.create(
|
|
106
|
-
model=self.
|
|
110
|
+
model=self.name,
|
|
107
111
|
temperature=self.temperature,
|
|
108
112
|
**self.generation_kwargs,
|
|
109
113
|
)
|
|
110
114
|
chat.append(user(prompt))
|
|
111
115
|
|
|
112
|
-
if schema and self.
|
|
116
|
+
if schema and self.name in structured_outputs_models:
|
|
113
117
|
response, structured_output = chat.parse(schema)
|
|
114
118
|
cost = self.calculate_cost(
|
|
115
119
|
response.usage.prompt_tokens,
|
|
@@ -133,6 +137,7 @@ class GrokModel(DeepEvalBaseLLM):
|
|
|
133
137
|
async def a_generate(
|
|
134
138
|
self, prompt: str, schema: Optional[BaseModel] = None
|
|
135
139
|
) -> Tuple[Union[str, Dict], float]:
|
|
140
|
+
|
|
136
141
|
try:
|
|
137
142
|
from xai_sdk.chat import user
|
|
138
143
|
except ImportError:
|
|
@@ -141,13 +146,13 @@ class GrokModel(DeepEvalBaseLLM):
|
|
|
141
146
|
)
|
|
142
147
|
client = self.load_model(async_mode=True)
|
|
143
148
|
chat = client.chat.create(
|
|
144
|
-
model=self.
|
|
149
|
+
model=self.name,
|
|
145
150
|
temperature=self.temperature,
|
|
146
151
|
**self.generation_kwargs,
|
|
147
152
|
)
|
|
148
153
|
chat.append(user(prompt))
|
|
149
154
|
|
|
150
|
-
if schema and self.
|
|
155
|
+
if schema and self.name in structured_outputs_models:
|
|
151
156
|
response, structured_output = await chat.parse(schema)
|
|
152
157
|
cost = self.calculate_cost(
|
|
153
158
|
response.usage.prompt_tokens,
|
|
@@ -176,7 +181,7 @@ class GrokModel(DeepEvalBaseLLM):
|
|
|
176
181
|
input_tokens: int,
|
|
177
182
|
output_tokens: int,
|
|
178
183
|
) -> float:
|
|
179
|
-
pricing = model_pricing.get(self.
|
|
184
|
+
pricing = model_pricing.get(self.name, model_pricing)
|
|
180
185
|
input_cost = input_tokens * pricing["input"]
|
|
181
186
|
output_cost = output_tokens * pricing["output"]
|
|
182
187
|
return input_cost + output_cost
|
|
@@ -198,9 +203,6 @@ class GrokModel(DeepEvalBaseLLM):
|
|
|
198
203
|
"xai_sdk is required to use GrokModel. Please install it with: pip install xai-sdk"
|
|
199
204
|
)
|
|
200
205
|
|
|
201
|
-
def get_model_name(self):
|
|
202
|
-
return f"{self.model_name}"
|
|
203
|
-
|
|
204
206
|
def _client_kwargs(self) -> Dict:
|
|
205
207
|
"""
|
|
206
208
|
If Tenacity is managing retries, disable gRPC channel retries to avoid double retry.
|
|
@@ -226,7 +228,14 @@ class GrokModel(DeepEvalBaseLLM):
|
|
|
226
228
|
return kwargs
|
|
227
229
|
|
|
228
230
|
def _build_client(self, cls):
|
|
229
|
-
|
|
231
|
+
api_key = require_secret_api_key(
|
|
232
|
+
self.api_key,
|
|
233
|
+
provider_label="Grok",
|
|
234
|
+
env_var_name="GROK_API_KEY",
|
|
235
|
+
param_hint="`api_key` to GrokModel(...)",
|
|
236
|
+
)
|
|
237
|
+
|
|
238
|
+
kw = dict(api_key=api_key, **self._client_kwargs())
|
|
230
239
|
try:
|
|
231
240
|
return cls(**kw)
|
|
232
241
|
except TypeError as e:
|
|
@@ -235,3 +244,6 @@ class GrokModel(DeepEvalBaseLLM):
|
|
|
235
244
|
kw.pop("channel_options", None)
|
|
236
245
|
return cls(**kw)
|
|
237
246
|
raise
|
|
247
|
+
|
|
248
|
+
def get_model_name(self):
|
|
249
|
+
return f"{self.name} (Grok)"
|