deepeval 3.7.5__py3-none-any.whl → 3.7.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/cli/main.py +2022 -759
- deepeval/cli/utils.py +208 -36
- deepeval/config/dotenv_handler.py +19 -0
- deepeval/config/settings.py +675 -245
- deepeval/config/utils.py +9 -1
- deepeval/dataset/api.py +23 -1
- deepeval/dataset/golden.py +106 -21
- deepeval/evaluate/evaluate.py +0 -3
- deepeval/evaluate/execute.py +162 -315
- deepeval/evaluate/utils.py +6 -30
- deepeval/key_handler.py +124 -51
- deepeval/metrics/__init__.py +0 -4
- deepeval/metrics/answer_relevancy/answer_relevancy.py +89 -132
- deepeval/metrics/answer_relevancy/template.py +102 -179
- deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
- deepeval/metrics/arena_g_eval/template.py +17 -1
- deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
- deepeval/metrics/argument_correctness/template.py +19 -2
- deepeval/metrics/base_metric.py +19 -41
- deepeval/metrics/bias/bias.py +102 -108
- deepeval/metrics/bias/template.py +14 -2
- deepeval/metrics/contextual_precision/contextual_precision.py +56 -92
- deepeval/metrics/contextual_recall/contextual_recall.py +58 -85
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +53 -83
- deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
- deepeval/metrics/conversation_completeness/template.py +23 -3
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
- deepeval/metrics/conversational_dag/nodes.py +66 -123
- deepeval/metrics/conversational_dag/templates.py +16 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
- deepeval/metrics/dag/dag.py +10 -0
- deepeval/metrics/dag/nodes.py +63 -126
- deepeval/metrics/dag/templates.py +14 -0
- deepeval/metrics/exact_match/exact_match.py +9 -1
- deepeval/metrics/faithfulness/faithfulness.py +82 -136
- deepeval/metrics/g_eval/g_eval.py +93 -79
- deepeval/metrics/g_eval/template.py +18 -1
- deepeval/metrics/g_eval/utils.py +7 -6
- deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
- deepeval/metrics/goal_accuracy/template.py +21 -3
- deepeval/metrics/hallucination/hallucination.py +60 -75
- deepeval/metrics/hallucination/template.py +13 -0
- deepeval/metrics/indicator.py +11 -10
- deepeval/metrics/json_correctness/json_correctness.py +40 -38
- deepeval/metrics/json_correctness/template.py +10 -0
- deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
- deepeval/metrics/knowledge_retention/schema.py +9 -3
- deepeval/metrics/knowledge_retention/template.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +72 -43
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +93 -75
- deepeval/metrics/mcp/schema.py +4 -0
- deepeval/metrics/mcp/template.py +59 -0
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
- deepeval/metrics/mcp_use_metric/template.py +12 -0
- deepeval/metrics/misuse/misuse.py +77 -97
- deepeval/metrics/misuse/template.py +15 -0
- deepeval/metrics/multimodal_metrics/__init__.py +0 -1
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +37 -38
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +55 -76
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +37 -38
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +37 -38
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +57 -76
- deepeval/metrics/non_advice/non_advice.py +79 -105
- deepeval/metrics/non_advice/template.py +12 -0
- deepeval/metrics/pattern_match/pattern_match.py +12 -4
- deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
- deepeval/metrics/pii_leakage/template.py +14 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
- deepeval/metrics/plan_adherence/template.py +11 -0
- deepeval/metrics/plan_quality/plan_quality.py +63 -87
- deepeval/metrics/plan_quality/template.py +9 -0
- deepeval/metrics/prompt_alignment/prompt_alignment.py +78 -86
- deepeval/metrics/prompt_alignment/template.py +12 -0
- deepeval/metrics/role_adherence/role_adherence.py +48 -71
- deepeval/metrics/role_adherence/template.py +14 -0
- deepeval/metrics/role_violation/role_violation.py +75 -108
- deepeval/metrics/role_violation/template.py +12 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
- deepeval/metrics/step_efficiency/template.py +11 -0
- deepeval/metrics/summarization/summarization.py +115 -183
- deepeval/metrics/summarization/template.py +19 -0
- deepeval/metrics/task_completion/task_completion.py +67 -73
- deepeval/metrics/tool_correctness/tool_correctness.py +43 -42
- deepeval/metrics/tool_use/schema.py +4 -0
- deepeval/metrics/tool_use/template.py +16 -2
- deepeval/metrics/tool_use/tool_use.py +72 -94
- deepeval/metrics/topic_adherence/schema.py +4 -0
- deepeval/metrics/topic_adherence/template.py +21 -1
- deepeval/metrics/topic_adherence/topic_adherence.py +68 -81
- deepeval/metrics/toxicity/template.py +13 -0
- deepeval/metrics/toxicity/toxicity.py +80 -99
- deepeval/metrics/turn_contextual_precision/schema.py +3 -3
- deepeval/metrics/turn_contextual_precision/template.py +9 -2
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +154 -154
- deepeval/metrics/turn_contextual_recall/schema.py +3 -3
- deepeval/metrics/turn_contextual_recall/template.py +8 -1
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +148 -143
- deepeval/metrics/turn_contextual_relevancy/schema.py +2 -2
- deepeval/metrics/turn_contextual_relevancy/template.py +8 -1
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +154 -157
- deepeval/metrics/turn_faithfulness/schema.py +1 -1
- deepeval/metrics/turn_faithfulness/template.py +8 -1
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +180 -203
- deepeval/metrics/turn_relevancy/template.py +14 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
- deepeval/metrics/utils.py +161 -91
- deepeval/models/__init__.py +2 -0
- deepeval/models/base_model.py +44 -6
- deepeval/models/embedding_models/azure_embedding_model.py +34 -12
- deepeval/models/embedding_models/local_embedding_model.py +22 -7
- deepeval/models/embedding_models/ollama_embedding_model.py +17 -6
- deepeval/models/embedding_models/openai_embedding_model.py +3 -2
- deepeval/models/llms/__init__.py +2 -0
- deepeval/models/llms/amazon_bedrock_model.py +229 -73
- deepeval/models/llms/anthropic_model.py +143 -48
- deepeval/models/llms/azure_model.py +169 -95
- deepeval/models/llms/constants.py +2032 -0
- deepeval/models/llms/deepseek_model.py +82 -35
- deepeval/models/llms/gemini_model.py +126 -67
- deepeval/models/llms/grok_model.py +128 -65
- deepeval/models/llms/kimi_model.py +129 -87
- deepeval/models/llms/litellm_model.py +94 -18
- deepeval/models/llms/local_model.py +115 -16
- deepeval/models/llms/ollama_model.py +97 -76
- deepeval/models/llms/openai_model.py +169 -311
- deepeval/models/llms/portkey_model.py +58 -16
- deepeval/models/llms/utils.py +5 -2
- deepeval/models/retry_policy.py +10 -5
- deepeval/models/utils.py +56 -4
- deepeval/simulator/conversation_simulator.py +49 -2
- deepeval/simulator/template.py +16 -1
- deepeval/synthesizer/synthesizer.py +19 -17
- deepeval/test_case/api.py +24 -45
- deepeval/test_case/arena_test_case.py +7 -2
- deepeval/test_case/conversational_test_case.py +55 -6
- deepeval/test_case/llm_test_case.py +60 -6
- deepeval/test_run/api.py +3 -0
- deepeval/test_run/test_run.py +6 -1
- deepeval/utils.py +26 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/METADATA +3 -3
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/RECORD +145 -148
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -133
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/WHEEL +0 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/entry_points.txt +0 -0
deepeval/models/llms/grok_model.py

@@ -1,6 +1,7 @@
-from typing import Optional, Tuple, Union, Dict
+from typing import Optional, Tuple, Union, Dict, List
 from pydantic import BaseModel, SecretStr
 
+from deepeval.errors import DeepEvalError
 from deepeval.config.settings import get_settings
 from deepeval.models.retry_policy import (
     create_retry_decorator,
@@ -8,57 +9,28 @@ from deepeval.models.retry_policy import (
 )
 from deepeval.models.llms.utils import trim_and_load_json
 from deepeval.models.utils import (
+    require_costs,
     require_secret_api_key,
 )
+from deepeval.test_case import MLLMImage
+from deepeval.utils import check_if_multimodal, convert_to_multi_modal_array
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.constants import ProviderSlug as PS
+from deepeval.models.llms.constants import GROK_MODELS_DATA
+from deepeval.utils import require_param
 
 # consistent retry rules
 retry_grok = create_retry_decorator(PS.GROK)
 
 
-structured_outputs_models = [
-    "grok-4-0709",
-    "grok-3",
-    "grok-3-mini",
-    "grok-3-fast",
-    "grok-3-mini-fast",
-]
-
-model_pricing = {
-    "grok-4-0709": {
-        "input": 0.20 / 1e6,
-        "output": 2.00 / 1e6,
-    },
-    "grok-3": {
-        "input": 1.00 / 1e6,
-        "output": 3.00 / 1e6,
-    },
-    "grok-3-mini": {
-        "input": 2.00 / 1e6,
-        "output": 5.00 / 1e6,
-    },
-    "grok-3-fast": {
-        "input": 0.60 / 1e6,
-        "output": 2.50 / 1e6,
-    },
-    "grok-3-mini-fast": {
-        "input": 30 / 1e6,
-        "output": 30 / 1e6,
-    },
-    "grok-2-vision-1212": {
-        "input": 1.00 / 1e6,
-        "output": 2.00 / 1e6,
-    },
-}
-
-
 class GrokModel(DeepEvalBaseLLM):
     def __init__(
         self,
         model: Optional[str] = None,
         api_key: Optional[str] = None,
-        temperature: float =
+        temperature: Optional[float] = None,
+        cost_per_input_token: Optional[float] = None,
+        cost_per_output_token: Optional[float] = None,
         generation_kwargs: Optional[Dict] = None,
         **kwargs,
     ):
@@ -67,27 +39,62 @@ class GrokModel(DeepEvalBaseLLM):
 
         model = model or settings.GROK_MODEL_NAME
 
-        if
-
-
-
-        temperature_from_key = settings.TEMPERATURE
-        if temperature_from_key is None:
-            self.temperature = temperature
+        if temperature is not None:
+            temperature = float(temperature)
+        elif settings.TEMPERATURE is not None:
+            temperature = settings.TEMPERATURE
         else:
-
-
-
+            temperature = 0.0
+
+        cost_per_input_token = (
+            cost_per_input_token
+            if cost_per_input_token is not None
+            else settings.GROK_COST_PER_INPUT_TOKEN
+        )
+        cost_per_output_token = (
+            cost_per_output_token
+            if cost_per_output_token is not None
+            else settings.GROK_COST_PER_OUTPUT_TOKEN
+        )
 
         if api_key is not None:
             # keep it secret, keep it safe from serializings, logging and alike
-            self.api_key: SecretStr
+            self.api_key: Optional[SecretStr] = SecretStr(api_key)
         else:
             self.api_key = settings.GROK_API_KEY
 
+        model = require_param(
+            model,
+            provider_label="GrokModel",
+            env_var_name="GROK_MODEL_NAME",
+            param_hint="model",
+        )
+
+        # validation
+        if temperature < 0:
+            raise DeepEvalError("Temperature must be >= 0.")
+
+        self.model_data = GROK_MODELS_DATA.get(model)
+        self.temperature = temperature
+
+        cost_per_input_token, cost_per_output_token = require_costs(
+            self.model_data,
+            model,
+            "GROK_COST_PER_INPUT_TOKEN",
+            "GROK_COST_PER_OUTPUT_TOKEN",
+            cost_per_input_token,
+            cost_per_output_token,
+        )
+        self.model_data.input_price = cost_per_input_token
+        self.model_data.output_price = cost_per_output_token
+
         # Keep sanitized kwargs for client call to strip legacy keys
         self.kwargs = kwargs
-        self.
+        self.kwargs.pop("temperature", None)
+
+        self.generation_kwargs = dict(generation_kwargs or {})
+        self.generation_kwargs.pop("temperature", None)
+
         super().__init__(model)
 
     ###############################################
@@ -97,7 +104,7 @@ class GrokModel(DeepEvalBaseLLM):
     @retry_grok
     def generate(
         self, prompt: str, schema: Optional[BaseModel] = None
-    ) -> Tuple[Union[str,
+    ) -> Tuple[Union[str, BaseModel], float]:
 
         try:
             from xai_sdk.chat import user
@@ -105,15 +112,21 @@ class GrokModel(DeepEvalBaseLLM):
             raise ImportError(
                 "xai_sdk is required to use GrokModel. Please install it with: pip install xai-sdk"
             )
+        if check_if_multimodal(prompt):
+            prompt = convert_to_multi_modal_array(input=prompt)
+            content = self.generate_content(prompt)
+        else:
+            content = [{"type": "text", "text": prompt}]
+
         client = self.load_model(async_mode=False)
         chat = client.chat.create(
             model=self.name,
             temperature=self.temperature,
             **self.generation_kwargs,
         )
-        chat.append(user(
+        chat.append(user(content))
 
-        if schema and self.
+        if schema and self.supports_structured_outputs() is True:
             response, structured_output = chat.parse(schema)
             cost = self.calculate_cost(
                 response.usage.prompt_tokens,
@@ -136,7 +149,7 @@ class GrokModel(DeepEvalBaseLLM):
     @retry_grok
     async def a_generate(
         self, prompt: str, schema: Optional[BaseModel] = None
-    ) -> Tuple[Union[str,
+    ) -> Tuple[Union[str, BaseModel], float]:
 
         try:
             from xai_sdk.chat import user
@@ -144,15 +157,22 @@ class GrokModel(DeepEvalBaseLLM):
             raise ImportError(
                 "xai_sdk is required to use GrokModel. Please install it with: pip install xai-sdk"
             )
+
+        if check_if_multimodal(prompt):
+            prompt = convert_to_multi_modal_array(input=prompt)
+            content = self.generate_content(prompt)
+        else:
+            content = [{"type": "text", "text": prompt}]
+
         client = self.load_model(async_mode=True)
         chat = client.chat.create(
             model=self.name,
             temperature=self.temperature,
             **self.generation_kwargs,
         )
-        chat.append(user(
+        chat.append(user(content))
 
-        if schema and self.
+        if schema and self.supports_structured_outputs() is True:
             response, structured_output = await chat.parse(schema)
             cost = self.calculate_cost(
                 response.usage.prompt_tokens,
@@ -172,19 +192,62 @@ class GrokModel(DeepEvalBaseLLM):
         else:
             return output, cost
 
+    def generate_content(
+        self, multimodal_input: List[Union[str, MLLMImage]] = []
+    ):
+        content = []
+        for element in multimodal_input:
+            if isinstance(element, str):
+                content.append({"type": "text", "text": element})
+            elif isinstance(element, MLLMImage):
+                if element.url and not element.local:
+                    content.append(
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": element.url},
+                        }
+                    )
+                else:
+                    element.ensure_images_loaded()
+                    data_uri = (
+                        f"data:{element.mimeType};base64,{element.dataBase64}"
+                    )
+                    content.append(
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": data_uri},
+                        }
+                    )
+        return content
+
     ###############################################
     # Utilities
     ###############################################
 
-    def calculate_cost(
-        self
-
-
-
-
-
-
+    def calculate_cost(self, input_tokens: int, output_tokens: int) -> float:
+        if self.model_data.input_price and self.model_data.output_price:
+            input_cost = input_tokens * self.model_data.input_price
+            output_cost = output_tokens * self.model_data.output_price
+            return input_cost + output_cost
+
+    ###############################################
+    # Capabilities
+    ###############################################
+
+    def supports_log_probs(self) -> Union[bool, None]:
+        return self.model_data.supports_log_probs
+
+    def supports_temperature(self) -> Union[bool, None]:
+        return self.model_data.supports_temperature
+
+    def supports_multimodal(self) -> Union[bool, None]:
+        return self.model_data.supports_multimodal
+
+    def supports_structured_outputs(self) -> Union[bool, None]:
+        return self.model_data.supports_structured_outputs
+
+    def supports_json_mode(self) -> Union[bool, None]:
+        return self.model_data.supports_json
 
     ###############################################
     # Model
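The net effect of the GrokModel rewrite: the hardcoded structured_outputs_models and model_pricing tables are replaced by per-model metadata in GROK_MODELS_DATA, and per-token costs become constructor arguments with settings fallbacks. Below is a minimal usage sketch based on the constructor signature and return types shown in the diff above; the model name, key placeholder, and costs are illustrative, and calling generate() requires xai-sdk installed plus a valid key.

from deepeval.models.llms.grok_model import GrokModel

model = GrokModel(
    model="grok-3",                     # illustrative; looked up in GROK_MODELS_DATA
    api_key="<your-xai-api-key>",       # wrapped in a pydantic SecretStr internally
    temperature=0.0,                    # falls back to settings.TEMPERATURE, then 0.0
    cost_per_input_token=1.00 / 1e6,    # overrides settings.GROK_COST_PER_INPUT_TOKEN
    cost_per_output_token=3.00 / 1e6,   # overrides settings.GROK_COST_PER_OUTPUT_TOKEN
)

# generate() now returns Tuple[Union[str, BaseModel], float]: the output plus
# the cost derived from the per-token prices via calculate_cost().
output, cost = model.generate("Summarize this diff in one sentence.")

Passing a pydantic schema triggers the structured-output path only when the model's metadata reports supports_structured_outputs(); otherwise the plain-text branch runs and the JSON is trimmed and loaded afterwards.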
deepeval/models/llms/kimi_model.py

@@ -1,7 +1,8 @@
-from typing import Optional, Tuple, Union, Dict
+from typing import Optional, Tuple, Union, Dict, List
 from openai import OpenAI, AsyncOpenAI
 from pydantic import BaseModel, SecretStr
 
+from deepeval.errors import DeepEvalError
 from deepeval.config.settings import get_settings
 from deepeval.models.retry_policy import (
     create_retry_decorator,
@@ -9,105 +10,91 @@ from deepeval.models.retry_policy import (
 )
 from deepeval.models.llms.utils import trim_and_load_json
 from deepeval.models.utils import (
+    require_costs,
     require_secret_api_key,
 )
+from deepeval.test_case import MLLMImage
+from deepeval.utils import check_if_multimodal, convert_to_multi_modal_array
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.constants import ProviderSlug as PS
-
+from deepeval.models.llms.constants import KIMI_MODELS_DATA
+from deepeval.utils import require_param
 
 retry_kimi = create_retry_decorator(PS.KIMI)
 
-json_mode_models = [
-    "kimi-thinking-preview",
-    "kimi-k2-0711-preview",
-    "kimi-latest-128k",
-    "kimi-latest-32k",
-    "kimi-latest-8k",
-]
-
-model_pricing = {
-    "kimi-latest-8k": {
-        "input": 0.20 / 1e6,
-        "output": 2.00 / 1e6,
-    },
-    "kimi-latest-32k": {
-        "input": 1.00 / 1e6,
-        "output": 3.00 / 1e6,
-    },
-    "kimi-latest-128k": {
-        "input": 2.00 / 1e6,
-        "output": 5.00 / 1e6,
-    },
-    "kimi-k2-0711-preview": {
-        "input": 0.60 / 1e6,
-        "output": 2.50 / 1e6,
-    },
-    "kimi-thinking-preview": {
-        "input": 30 / 1e6,
-        "output": 30 / 1e6,
-    },
-    "moonshot-v1-8k": {
-        "input": 1.00 / 1e6,
-        "output": 2.00 / 1e6,
-    },
-    "moonshot-v1-32k": {
-        "input": 2.00 / 1e6,
-        "output": 3.00 / 1e6,
-    },
-    "moonshot-v1-128k": {
-        "input": 0.20 / 1e6,
-        "output": 5.00 / 1e6,
-    },
-    "moonshot-v1-8k-vision-preview": {
-        "input": 1.00 / 1e6,
-        "output": 2.00 / 1e6,
-    },
-    "moonshot-v1-32k-vision-preview": {
-        "input": 2.00 / 1e6,
-        "output": 3.00 / 1e6,
-    },
-    "moonshot-v1-128k-vision-preview": {
-        "input": 0.20 / 1e6,
-        "output": 5.00 / 1e6,
-    },
-}
-
 
 class KimiModel(DeepEvalBaseLLM):
     def __init__(
         self,
         model: Optional[str] = None,
         api_key: Optional[str] = None,
-        temperature: float =
+        temperature: Optional[float] = None,
+        cost_per_input_token: Optional[float] = None,
+        cost_per_output_token: Optional[float] = None,
         generation_kwargs: Optional[Dict] = None,
         **kwargs,
     ):
         settings = get_settings()
 
         model = model or settings.MOONSHOT_MODEL_NAME
-        if model not in model_pricing:
-            raise ValueError(
-                f"Invalid model. Available Moonshot models: {', '.join(model_pricing.keys())}"
-            )
 
-
-
-
+        if temperature is not None:
+            temperature = float(temperature)
+        elif settings.TEMPERATURE is not None:
+            temperature = settings.TEMPERATURE
         else:
-
-
-
+            temperature = 0.0
+
+        cost_per_input_token = (
+            cost_per_input_token
+            if cost_per_input_token is not None
+            else settings.MOONSHOT_COST_PER_INPUT_TOKEN
+        )
+        cost_per_output_token = (
+            cost_per_output_token
+            if cost_per_output_token is not None
+            else settings.MOONSHOT_COST_PER_OUTPUT_TOKEN
+        )
 
         if api_key is not None:
             # keep it secret, keep it safe from serializings, logging and alike
-            self.api_key: SecretStr
+            self.api_key: Optional[SecretStr] = SecretStr(api_key)
         else:
             self.api_key = settings.MOONSHOT_API_KEY
 
+        # validation
+        model = require_param(
+            model,
+            provider_label="KimiModel",
+            env_var_name="MOONSHOT_MODEL_NAME",
+            param_hint="model",
+        )
+
+        if temperature < 0:
+            raise DeepEvalError("Temperature must be >= 0.")
+
+        self.model_data = KIMI_MODELS_DATA.get(model)
+        self.temperature = temperature
+
+        cost_per_input_token, cost_per_output_token = require_costs(
+            self.model_data,
+            model,
+            "MOONSHOT_COST_PER_INPUT_TOKEN",
+            "MOONSHOT_COST_PER_OUTPUT_TOKEN",
+            cost_per_input_token,
+            cost_per_output_token,
+        )
+        self.model_data.input_price = float(cost_per_input_token)
+        self.model_data.output_price = float(cost_per_output_token)
+
         self.base_url = "https://api.moonshot.cn/v1"
         # Keep sanitized kwargs for client call to strip legacy keys
         self.kwargs = kwargs
-        self.
+        self.kwargs.pop("temperature", None)
+
+        self.generation_kwargs = dict(generation_kwargs or {})
+        self.generation_kwargs.pop("temperature", None)
+
         super().__init__(model)
 
     ###############################################
@@ -117,13 +104,19 @@ class KimiModel(DeepEvalBaseLLM):
     @retry_kimi
     def generate(
         self, prompt: str, schema: Optional[BaseModel] = None
-    ) -> Tuple[Union[str,
+    ) -> Tuple[Union[str, BaseModel], float]:
+
+        if check_if_multimodal(prompt):
+            prompt = convert_to_multi_modal_array(input=prompt)
+            content = self.generate_content(prompt)
+        else:
+            content = [{"type": "text", "text": prompt}]
 
         client = self.load_model(async_mode=False)
-        if schema and self.
+        if schema and self.supports_json_mode() is True:
             completion = client.chat.completions.create(
                 model=self.name,
-                messages=[{"role": "user", "content":
+                messages=[{"role": "user", "content": content}],
                 response_format={"type": "json_object"},
                 temperature=self.temperature,
                 **self.generation_kwargs,
@@ -139,7 +132,7 @@ class KimiModel(DeepEvalBaseLLM):
 
             completion = client.chat.completions.create(
                 model=self.name,
-                messages=[{"role": "user", "content":
+                messages=[{"role": "user", "content": content}],
                 **self.generation_kwargs,
             )
             output = completion.choices[0].message.content
@@ -156,13 +149,19 @@ class KimiModel(DeepEvalBaseLLM):
     @retry_kimi
     async def a_generate(
         self, prompt: str, schema: Optional[BaseModel] = None
-    ) -> Tuple[Union[str,
+    ) -> Tuple[Union[str, BaseModel], float]:
+
+        if check_if_multimodal(prompt):
+            prompt = convert_to_multi_modal_array(input=prompt)
+            content = self.generate_content(prompt)
+        else:
+            content = [{"type": "text", "text": prompt}]
 
         client = self.load_model(async_mode=True)
-        if schema and self.
+        if schema and self.supports_json_mode() is True:
             completion = await client.chat.completions.create(
                 model=self.name,
-                messages=[{"role": "user", "content":
+                messages=[{"role": "user", "content": content}],
                 response_format={"type": "json_object"},
                 temperature=self.temperature,
                 **self.generation_kwargs,
@@ -178,7 +177,7 @@ class KimiModel(DeepEvalBaseLLM):
 
             completion = await client.chat.completions.create(
                 model=self.name,
-                messages=[{"role": "user", "content":
+                messages=[{"role": "user", "content": content}],
                 **self.generation_kwargs,
             )
             output = completion.choices[0].message.content
@@ -192,19 +191,62 @@ class KimiModel(DeepEvalBaseLLM):
         else:
             return output, cost
 
+    def generate_content(
+        self, multimodal_input: List[Union[str, MLLMImage]] = []
+    ):
+        content = []
+        for element in multimodal_input:
+            if isinstance(element, str):
+                content.append({"type": "text", "text": element})
+            elif isinstance(element, MLLMImage):
+                if element.url and not element.local:
+                    content.append(
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": element.url},
+                        }
+                    )
+                else:
+                    element.ensure_images_loaded()
+                    data_uri = (
+                        f"data:{element.mimeType};base64,{element.dataBase64}"
+                    )
+                    content.append(
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": data_uri},
+                        }
+                    )
+        return content
+
     ###############################################
     # Utilities
     ###############################################
 
-    def calculate_cost(
-        self
-
-
-
-
-
-
+    def calculate_cost(self, input_tokens: int, output_tokens: int) -> float:
+        if self.model_data.input_price and self.model_data.output_price:
+            input_cost = input_tokens * self.model_data.input_price
+            output_cost = output_tokens * self.model_data.output_price
+            return input_cost + output_cost
+
+    ###############################################
+    # Capabilities
+    ###############################################
+
+    def supports_log_probs(self) -> Union[bool, None]:
+        return self.model_data.supports_log_probs
+
+    def supports_temperature(self) -> Union[bool, None]:
+        return self.model_data.supports_temperature
+
+    def supports_multimodal(self) -> Union[bool, None]:
+        return self.model_data.supports_multimodal
+
+    def supports_structured_outputs(self) -> Union[bool, None]:
+        return self.model_data.supports_structured_outputs
+
+    def supports_json_mode(self) -> Union[bool, None]:
+        return self.model_data.supports_json
 
     ###############################################
     # Model
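The new generate_content() helper, identical in both files, is what carries the multimodal path: strings become text parts, and MLLMImage objects become image_url parts, either by remote URL or as a base64 data URI. A sketch of the resulting shape follows, assuming MLLMImage accepts url and local the way the diff reads those fields; the model name and image URL are illustrative, and settings such as MOONSHOT_API_KEY and the model's cost metadata are assumed to be configured.

from deepeval.test_case import MLLMImage
from deepeval.models.llms.kimi_model import KimiModel

model = KimiModel(model="moonshot-v1-8k-vision-preview")  # illustrative model name

parts = model.generate_content(
    [
        "What is shown in this image?",
        MLLMImage(url="https://example.com/cat.png", local=False),
    ]
)
# Expected shape, per the remote (non-local) image branch above:
# [
#     {"type": "text", "text": "What is shown in this image?"},
#     {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
# ]

Local images instead go through ensure_images_loaded() and are inlined as a data:{mimeType};base64,... URI, so the same content list works whether or not the file is reachable by the provider.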