deepeval 3.7.5__py3-none-any.whl → 3.7.6__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to their respective public registries. It is provided for informational purposes only.
Files changed (133)
  1. deepeval/_version.py +1 -1
  2. deepeval/config/settings.py +35 -1
  3. deepeval/dataset/api.py +23 -1
  4. deepeval/dataset/golden.py +106 -21
  5. deepeval/evaluate/evaluate.py +0 -3
  6. deepeval/evaluate/execute.py +10 -222
  7. deepeval/evaluate/utils.py +6 -30
  8. deepeval/key_handler.py +3 -0
  9. deepeval/metrics/__init__.py +0 -4
  10. deepeval/metrics/answer_relevancy/answer_relevancy.py +89 -132
  11. deepeval/metrics/answer_relevancy/template.py +102 -179
  12. deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
  13. deepeval/metrics/arena_g_eval/template.py +17 -1
  14. deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
  15. deepeval/metrics/argument_correctness/template.py +19 -2
  16. deepeval/metrics/base_metric.py +13 -41
  17. deepeval/metrics/bias/bias.py +102 -108
  18. deepeval/metrics/bias/template.py +14 -2
  19. deepeval/metrics/contextual_precision/contextual_precision.py +56 -92
  20. deepeval/metrics/contextual_recall/contextual_recall.py +58 -85
  21. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +53 -83
  22. deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
  23. deepeval/metrics/conversation_completeness/template.py +23 -3
  24. deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
  25. deepeval/metrics/conversational_dag/nodes.py +66 -123
  26. deepeval/metrics/conversational_dag/templates.py +16 -0
  27. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
  28. deepeval/metrics/dag/dag.py +10 -0
  29. deepeval/metrics/dag/nodes.py +63 -126
  30. deepeval/metrics/dag/templates.py +14 -0
  31. deepeval/metrics/exact_match/exact_match.py +9 -1
  32. deepeval/metrics/faithfulness/faithfulness.py +82 -136
  33. deepeval/metrics/g_eval/g_eval.py +87 -78
  34. deepeval/metrics/g_eval/template.py +18 -1
  35. deepeval/metrics/g_eval/utils.py +7 -6
  36. deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
  37. deepeval/metrics/goal_accuracy/template.py +21 -3
  38. deepeval/metrics/hallucination/hallucination.py +60 -75
  39. deepeval/metrics/hallucination/template.py +13 -0
  40. deepeval/metrics/indicator.py +3 -6
  41. deepeval/metrics/json_correctness/json_correctness.py +40 -38
  42. deepeval/metrics/json_correctness/template.py +10 -0
  43. deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
  44. deepeval/metrics/knowledge_retention/schema.py +9 -3
  45. deepeval/metrics/knowledge_retention/template.py +12 -0
  46. deepeval/metrics/mcp/mcp_task_completion.py +68 -38
  47. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
  48. deepeval/metrics/mcp/template.py +52 -0
  49. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
  50. deepeval/metrics/mcp_use_metric/template.py +12 -0
  51. deepeval/metrics/misuse/misuse.py +77 -97
  52. deepeval/metrics/misuse/template.py +15 -0
  53. deepeval/metrics/multimodal_metrics/__init__.py +0 -1
  54. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +37 -38
  55. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +55 -76
  56. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +37 -38
  57. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +37 -38
  58. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +57 -76
  59. deepeval/metrics/non_advice/non_advice.py +79 -105
  60. deepeval/metrics/non_advice/template.py +12 -0
  61. deepeval/metrics/pattern_match/pattern_match.py +12 -4
  62. deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
  63. deepeval/metrics/pii_leakage/template.py +14 -0
  64. deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
  65. deepeval/metrics/plan_adherence/template.py +11 -0
  66. deepeval/metrics/plan_quality/plan_quality.py +63 -87
  67. deepeval/metrics/plan_quality/template.py +9 -0
  68. deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
  69. deepeval/metrics/prompt_alignment/template.py +12 -0
  70. deepeval/metrics/role_adherence/role_adherence.py +48 -71
  71. deepeval/metrics/role_adherence/template.py +14 -0
  72. deepeval/metrics/role_violation/role_violation.py +75 -108
  73. deepeval/metrics/role_violation/template.py +12 -0
  74. deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
  75. deepeval/metrics/step_efficiency/template.py +11 -0
  76. deepeval/metrics/summarization/summarization.py +115 -183
  77. deepeval/metrics/summarization/template.py +19 -0
  78. deepeval/metrics/task_completion/task_completion.py +67 -73
  79. deepeval/metrics/tool_correctness/tool_correctness.py +43 -42
  80. deepeval/metrics/tool_use/tool_use.py +42 -66
  81. deepeval/metrics/topic_adherence/template.py +13 -0
  82. deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
  83. deepeval/metrics/toxicity/template.py +13 -0
  84. deepeval/metrics/toxicity/toxicity.py +80 -99
  85. deepeval/metrics/turn_contextual_precision/schema.py +3 -3
  86. deepeval/metrics/turn_contextual_precision/template.py +1 -1
  87. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +110 -68
  88. deepeval/metrics/turn_contextual_recall/schema.py +3 -3
  89. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +104 -61
  90. deepeval/metrics/turn_contextual_relevancy/schema.py +2 -2
  91. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +106 -65
  92. deepeval/metrics/turn_faithfulness/schema.py +1 -1
  93. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +104 -73
  94. deepeval/metrics/turn_relevancy/template.py +14 -0
  95. deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
  96. deepeval/metrics/utils.py +145 -90
  97. deepeval/models/base_model.py +44 -6
  98. deepeval/models/embedding_models/azure_embedding_model.py +34 -12
  99. deepeval/models/embedding_models/local_embedding_model.py +22 -7
  100. deepeval/models/embedding_models/ollama_embedding_model.py +17 -6
  101. deepeval/models/embedding_models/openai_embedding_model.py +3 -2
  102. deepeval/models/llms/amazon_bedrock_model.py +226 -71
  103. deepeval/models/llms/anthropic_model.py +141 -47
  104. deepeval/models/llms/azure_model.py +167 -94
  105. deepeval/models/llms/constants.py +2032 -0
  106. deepeval/models/llms/deepseek_model.py +79 -29
  107. deepeval/models/llms/gemini_model.py +126 -67
  108. deepeval/models/llms/grok_model.py +125 -59
  109. deepeval/models/llms/kimi_model.py +126 -81
  110. deepeval/models/llms/litellm_model.py +92 -18
  111. deepeval/models/llms/local_model.py +114 -15
  112. deepeval/models/llms/ollama_model.py +97 -76
  113. deepeval/models/llms/openai_model.py +167 -310
  114. deepeval/models/llms/portkey_model.py +58 -16
  115. deepeval/models/llms/utils.py +5 -2
  116. deepeval/models/utils.py +60 -4
  117. deepeval/simulator/conversation_simulator.py +43 -0
  118. deepeval/simulator/template.py +13 -0
  119. deepeval/test_case/api.py +24 -45
  120. deepeval/test_case/arena_test_case.py +7 -2
  121. deepeval/test_case/conversational_test_case.py +55 -6
  122. deepeval/test_case/llm_test_case.py +60 -6
  123. deepeval/test_run/api.py +3 -0
  124. {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -1
  125. {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/RECORD +128 -132
  126. deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
  127. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
  128. deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
  129. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -133
  130. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
  131. {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
  132. {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
  133. {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
deepeval/models/llms/grok_model.py

@@ -1,6 +1,7 @@
-from typing import Optional, Tuple, Union, Dict
+from typing import Optional, Tuple, Union, Dict, List
 from pydantic import BaseModel, SecretStr
 
+from deepeval.errors import DeepEvalError
 from deepeval.config.settings import get_settings
 from deepeval.models.retry_policy import (
     create_retry_decorator,
@@ -8,57 +9,28 @@ from deepeval.models.retry_policy import (
 )
 from deepeval.models.llms.utils import trim_and_load_json
 from deepeval.models.utils import (
+    require_costs,
     require_secret_api_key,
 )
+from deepeval.test_case import MLLMImage
+from deepeval.utils import check_if_multimodal, convert_to_multi_modal_array
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.constants import ProviderSlug as PS
+from deepeval.models.llms.constants import GROK_MODELS_DATA
+from deepeval.utils import require_param
 
 # consistent retry rules
 retry_grok = create_retry_decorator(PS.GROK)
 
 
-structured_outputs_models = [
-    "grok-4-0709",
-    "grok-3",
-    "grok-3-mini",
-    "grok-3-fast",
-    "grok-3-mini-fast",
-]
-
-model_pricing = {
-    "grok-4-0709": {
-        "input": 0.20 / 1e6,
-        "output": 2.00 / 1e6,
-    },
-    "grok-3": {
-        "input": 1.00 / 1e6,
-        "output": 3.00 / 1e6,
-    },
-    "grok-3-mini": {
-        "input": 2.00 / 1e6,
-        "output": 5.00 / 1e6,
-    },
-    "grok-3-fast": {
-        "input": 0.60 / 1e6,
-        "output": 2.50 / 1e6,
-    },
-    "grok-3-mini-fast": {
-        "input": 30 / 1e6,
-        "output": 30 / 1e6,
-    },
-    "grok-2-vision-1212": {
-        "input": 1.00 / 1e6,
-        "output": 2.00 / 1e6,
-    },
-}
-
-
 class GrokModel(DeepEvalBaseLLM):
     def __init__(
         self,
         model: Optional[str] = None,
         api_key: Optional[str] = None,
-        temperature: float = 0,
+        temperature: Optional[float] = None,
+        cost_per_input_token: Optional[float] = None,
+        cost_per_output_token: Optional[float] = None,
         generation_kwargs: Optional[Dict] = None,
         **kwargs,
     ):
@@ -67,27 +39,62 @@ class GrokModel(DeepEvalBaseLLM):
 
         model = model or settings.GROK_MODEL_NAME
 
-        if model not in model_pricing:
-            raise ValueError(
-                f"Invalid model. Available Grok models: {', '.join(model_pricing.keys())}"
-            )
-        temperature_from_key = settings.TEMPERATURE
-        if temperature_from_key is None:
-            self.temperature = temperature
+        if temperature is not None:
+            temperature = float(temperature)
+        elif settings.TEMPERATURE is not None:
+            temperature = settings.TEMPERATURE
         else:
-            self.temperature = float(temperature_from_key)
-        if self.temperature < 0:
-            raise ValueError("Temperature must be >= 0.")
+            temperature = 0.0
+
+        cost_per_input_token = (
+            cost_per_input_token
+            if cost_per_input_token is not None
+            else settings.GROK_COST_PER_INPUT_TOKEN
+        )
+        cost_per_output_token = (
+            cost_per_output_token
+            if cost_per_output_token is not None
+            else settings.GROK_COST_PER_OUTPUT_TOKEN
+        )
 
         if api_key is not None:
             # keep it secret, keep it safe from serializings, logging and alike
-            self.api_key: SecretStr | None = SecretStr(api_key)
+            self.api_key: Optional[SecretStr] = SecretStr(api_key)
         else:
             self.api_key = settings.GROK_API_KEY
 
+        model = require_param(
+            model,
+            provider_label="GrokModel",
+            env_var_name="GROK_MODEL_NAME",
+            param_hint="model",
+        )
+
+        # validation
+        if temperature < 0:
+            raise DeepEvalError("Temperature must be >= 0.")
+
+        self.model_data = GROK_MODELS_DATA.get(model)
+        self.temperature = temperature
+
+        cost_per_input_token, cost_per_output_token = require_costs(
+            self.model_data,
+            model,
+            "GROK_COST_PER_INPUT_TOKEN",
+            "GROK_COST_PER_OUTPUT_TOKEN",
+            cost_per_input_token,
+            cost_per_output_token,
+        )
+        self.model_data.input_price = cost_per_input_token
+        self.model_data.output_price = cost_per_output_token
+
         # Keep sanitized kwargs for client call to strip legacy keys
         self.kwargs = kwargs
-        self.generation_kwargs = generation_kwargs or {}
+        self.kwargs.pop("temperature", None)
+
+        self.generation_kwargs = dict(generation_kwargs or {})
+        self.generation_kwargs.pop("temperature", None)
+
         super().__init__(model)
 
     ###############################################
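
Note: the constructor change above replaces the hardcoded model_pricing lookup with explicit per-token cost overrides that fall back to settings and then to the bundled model metadata. A minimal usage sketch, assuming the import path from the file list above; the model name and prices are illustrative, not actual xAI rates:

# Hedged sketch of the new 3.7.6 constructor; values are placeholders.
from deepeval.models.llms.grok_model import GrokModel

model = GrokModel(
    model="grok-3",                    # falls back to settings.GROK_MODEL_NAME
    temperature=0.0,                   # falls back to settings.TEMPERATURE, then 0.0
    cost_per_input_token=1.00 / 1e6,   # falls back to settings.GROK_COST_PER_INPUT_TOKEN
    cost_per_output_token=3.00 / 1e6,  # falls back to settings.GROK_COST_PER_OUTPUT_TOKEN
)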
@@ -97,7 +104,7 @@ class GrokModel(DeepEvalBaseLLM):
     @retry_grok
     def generate(
         self, prompt: str, schema: Optional[BaseModel] = None
-    ) -> Tuple[Union[str, Dict], float]:
+    ) -> Tuple[Union[str, BaseModel], float]:
 
         try:
             from xai_sdk.chat import user
@@ -105,15 +112,21 @@ class GrokModel(DeepEvalBaseLLM):
             raise ImportError(
                 "xai_sdk is required to use GrokModel. Please install it with: pip install xai-sdk"
             )
+        if check_if_multimodal(prompt):
+            prompt = convert_to_multi_modal_array(input=prompt)
+            content = self.generate_content(prompt)
+        else:
+            content = [{"type": "text", "text": prompt}]
+
         client = self.load_model(async_mode=False)
         chat = client.chat.create(
             model=self.name,
             temperature=self.temperature,
             **self.generation_kwargs,
         )
-        chat.append(user(prompt))
+        chat.append(user(content))
 
-        if schema and self.name in structured_outputs_models:
+        if schema and self.supports_structured_outputs() is True:
             response, structured_output = chat.parse(schema)
             cost = self.calculate_cost(
                 response.usage.prompt_tokens,
@@ -136,7 +149,7 @@ class GrokModel(DeepEvalBaseLLM):
     @retry_grok
     async def a_generate(
         self, prompt: str, schema: Optional[BaseModel] = None
-    ) -> Tuple[Union[str, Dict], float]:
+    ) -> Tuple[Union[str, BaseModel], float]:
 
         try:
             from xai_sdk.chat import user
@@ -144,15 +157,22 @@ class GrokModel(DeepEvalBaseLLM):
             raise ImportError(
                 "xai_sdk is required to use GrokModel. Please install it with: pip install xai-sdk"
             )
+
+        if check_if_multimodal(prompt):
+            prompt = convert_to_multi_modal_array(input=prompt)
+            content = self.generate_content(prompt)
+        else:
+            content = [{"type": "text", "text": prompt}]
+
         client = self.load_model(async_mode=True)
         chat = client.chat.create(
             model=self.name,
             temperature=self.temperature,
             **self.generation_kwargs,
         )
-        chat.append(user(prompt))
+        chat.append(user(content))
 
-        if schema and self.name in structured_outputs_models:
+        if schema and self.supports_structured_outputs() is True:
             response, structured_output = await chat.parse(schema)
             cost = self.calculate_cost(
                 response.usage.prompt_tokens,
@@ -172,6 +192,34 @@ class GrokModel(DeepEvalBaseLLM):
         else:
             return output, cost
 
+    def generate_content(
+        self, multimodal_input: List[Union[str, MLLMImage]] = []
+    ):
+        content = []
+        for element in multimodal_input:
+            if isinstance(element, str):
+                content.append({"type": "text", "text": element})
+            elif isinstance(element, MLLMImage):
+                if element.url and not element.local:
+                    content.append(
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": element.url},
+                        }
+                    )
+                else:
+                    element.ensure_images_loaded()
+                    data_uri = (
+                        f"data:{element.mimeType};base64,{element.dataBase64}"
+                    )
+                    content.append(
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": data_uri},
+                        }
+                    )
+        return content
+
     ###############################################
     # Utilities
     ###############################################
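
Note: the new generate_content helper normalizes a mixed list of strings and MLLMImage objects into OpenAI-style content parts, passing remote URLs through and inlining local images as base64 data URIs. A self-contained sketch of the same transformation, using a hypothetical ImageStub in place of deepeval's MLLMImage:

# Standalone sketch of the content shape generate_content produces.
# ImageStub is a hypothetical stand-in for deepeval.test_case.MLLMImage.
from dataclasses import dataclass
from typing import List, Union

@dataclass
class ImageStub:
    url: str = ""
    local: bool = False
    mimeType: str = "image/png"
    dataBase64: str = ""

def to_content_parts(parts: List[Union[str, ImageStub]]) -> list:
    content = []
    for element in parts:
        if isinstance(element, str):
            content.append({"type": "text", "text": element})
        elif element.url and not element.local:
            # remote image: pass the URL through untouched
            content.append({"type": "image_url", "image_url": {"url": element.url}})
        else:
            # local image: inline it as a base64 data URI
            uri = f"data:{element.mimeType};base64,{element.dataBase64}"
            content.append({"type": "image_url", "image_url": {"url": uri}})
    return content

# [{'type': 'text', 'text': 'Describe this image.'},
#  {'type': 'image_url', 'image_url': {'url': 'https://example.com/cat.png'}}]
print(to_content_parts(["Describe this image.", ImageStub(url="https://example.com/cat.png")]))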
@@ -181,11 +229,29 @@ class GrokModel(DeepEvalBaseLLM):
         input_tokens: int,
         output_tokens: int,
     ) -> float:
-        pricing = model_pricing.get(self.name, model_pricing)
-        input_cost = input_tokens * pricing["input"]
-        output_cost = output_tokens * pricing["output"]
+        input_cost = input_tokens * self.model_data.input_price
+        output_cost = output_tokens * self.model_data.output_price
         return input_cost + output_cost
 
+    ###############################################
+    # Capabilities
+    ###############################################
+
+    def supports_log_probs(self) -> Union[bool, None]:
+        return self.model_data.supports_log_probs
+
+    def supports_temperature(self) -> Union[bool, None]:
+        return self.model_data.supports_temperature
+
+    def supports_multimodal(self) -> Union[bool, None]:
+        return self.model_data.supports_multimodal
+
+    def supports_structured_outputs(self) -> Union[bool, None]:
+        return self.model_data.supports_structured_outputs
+
+    def supports_json_mode(self) -> Union[bool, None]:
+        return self.model_data.supports_json
+
     ###############################################
     # Model
     ###############################################
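
Note: capability checks now read from the shared GROK_MODELS_DATA table instead of the removed structured_outputs_models list, which is what lets generate() choose between chat.parse(schema) and plain generation. A hedged sketch of gating on the new capability API; the model name is illustrative, and running it requires xai-sdk plus a configured GROK_API_KEY:

# Hedged sketch: branching on the new capability methods.
# Whether a given model reports True depends on GROK_MODELS_DATA.
from pydantic import BaseModel
from deepeval.models.llms.grok_model import GrokModel

class Verdict(BaseModel):
    verdict: str

model = GrokModel(model="grok-3")  # illustrative model name
if model.supports_structured_outputs() is True:
    output, cost = model.generate("Is the sky blue? Answer yes or no.", schema=Verdict)
else:
    output, cost = model.generate("Is the sky blue? Answer yes or no.")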
deepeval/models/llms/kimi_model.py

@@ -1,7 +1,8 @@
-from typing import Optional, Tuple, Union, Dict
+from typing import Optional, Tuple, Union, Dict, List
 from openai import OpenAI, AsyncOpenAI
 from pydantic import BaseModel, SecretStr
 
+from deepeval.errors import DeepEvalError
 from deepeval.config.settings import get_settings
 from deepeval.models.retry_policy import (
     create_retry_decorator,
@@ -9,105 +10,91 @@ from deepeval.models.retry_policy import (
 )
 from deepeval.models.llms.utils import trim_and_load_json
 from deepeval.models.utils import (
+    require_costs,
     require_secret_api_key,
 )
+from deepeval.test_case import MLLMImage
+from deepeval.utils import check_if_multimodal, convert_to_multi_modal_array
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.constants import ProviderSlug as PS
-
+from deepeval.models.llms.constants import KIMI_MODELS_DATA
+from deepeval.utils import require_param
 
 retry_kimi = create_retry_decorator(PS.KIMI)
 
-json_mode_models = [
-    "kimi-thinking-preview",
-    "kimi-k2-0711-preview",
-    "kimi-latest-128k",
-    "kimi-latest-32k",
-    "kimi-latest-8k",
-]
-
-model_pricing = {
-    "kimi-latest-8k": {
-        "input": 0.20 / 1e6,
-        "output": 2.00 / 1e6,
-    },
-    "kimi-latest-32k": {
-        "input": 1.00 / 1e6,
-        "output": 3.00 / 1e6,
-    },
-    "kimi-latest-128k": {
-        "input": 2.00 / 1e6,
-        "output": 5.00 / 1e6,
-    },
-    "kimi-k2-0711-preview": {
-        "input": 0.60 / 1e6,
-        "output": 2.50 / 1e6,
-    },
-    "kimi-thinking-preview": {
-        "input": 30 / 1e6,
-        "output": 30 / 1e6,
-    },
-    "moonshot-v1-8k": {
-        "input": 1.00 / 1e6,
-        "output": 2.00 / 1e6,
-    },
-    "moonshot-v1-32k": {
-        "input": 2.00 / 1e6,
-        "output": 3.00 / 1e6,
-    },
-    "moonshot-v1-128k": {
-        "input": 0.20 / 1e6,
-        "output": 5.00 / 1e6,
-    },
-    "moonshot-v1-8k-vision-preview": {
-        "input": 1.00 / 1e6,
-        "output": 2.00 / 1e6,
-    },
-    "moonshot-v1-32k-vision-preview": {
-        "input": 2.00 / 1e6,
-        "output": 3.00 / 1e6,
-    },
-    "moonshot-v1-128k-vision-preview": {
-        "input": 0.20 / 1e6,
-        "output": 5.00 / 1e6,
-    },
-}
-
 
 class KimiModel(DeepEvalBaseLLM):
     def __init__(
         self,
         model: Optional[str] = None,
         api_key: Optional[str] = None,
-        temperature: float = 0,
+        temperature: Optional[float] = None,
+        cost_per_input_token: Optional[float] = None,
+        cost_per_output_token: Optional[float] = None,
         generation_kwargs: Optional[Dict] = None,
         **kwargs,
    ):
         settings = get_settings()
 
         model = model or settings.MOONSHOT_MODEL_NAME
-        if model not in model_pricing:
-            raise ValueError(
-                f"Invalid model. Available Moonshot models: {', '.join(model_pricing.keys())}"
-            )
 
-        temperature_from_key = settings.TEMPERATURE
-        if temperature_from_key is None:
-            self.temperature = temperature
+        if temperature is not None:
+            temperature = float(temperature)
+        elif settings.TEMPERATURE is not None:
+            temperature = settings.TEMPERATURE
         else:
-            self.temperature = float(temperature_from_key)
-        if self.temperature < 0:
-            raise ValueError("Temperature must be >= 0.")
+            temperature = 0.0
+
+        cost_per_input_token = (
+            cost_per_input_token
+            if cost_per_input_token is not None
+            else settings.MOONSHOT_COST_PER_INPUT_TOKEN
+        )
+        cost_per_output_token = (
+            cost_per_output_token
+            if cost_per_output_token is not None
+            else settings.MOONSHOT_COST_PER_OUTPUT_TOKEN
+        )
 
         if api_key is not None:
             # keep it secret, keep it safe from serializings, logging and alike
-            self.api_key: SecretStr | None = SecretStr(api_key)
+            self.api_key: Optional[SecretStr] = SecretStr(api_key)
         else:
             self.api_key = settings.MOONSHOT_API_KEY
 
+        # validation
+        model = require_param(
+            model,
+            provider_label="KimiModel",
+            env_var_name="MOONSHOT_MODEL_NAME",
+            param_hint="model",
+        )
+
+        if temperature < 0:
+            raise DeepEvalError("Temperature must be >= 0.")
+
+        self.model_data = KIMI_MODELS_DATA.get(model)
+        self.temperature = temperature
+
+        cost_per_input_token, cost_per_output_token = require_costs(
+            self.model_data,
+            model,
+            "MOONSHOT_COST_PER_INPUT_TOKEN",
+            "MOONSHOT_COST_PER_OUTPUT_TOKEN",
+            cost_per_input_token,
+            cost_per_output_token,
+        )
+        self.model_data.input_price = float(cost_per_input_token)
+        self.model_data.output_price = float(cost_per_output_token)
+
         self.base_url = "https://api.moonshot.cn/v1"
         # Keep sanitized kwargs for client call to strip legacy keys
         self.kwargs = kwargs
-        self.generation_kwargs = generation_kwargs or {}
+        self.kwargs.pop("temperature", None)
+
+        self.generation_kwargs = dict(generation_kwargs or {})
+        self.generation_kwargs.pop("temperature", None)
+
         super().__init__(model)
 
     ###############################################
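
Note: KimiModel now resolves temperature and per-token costs the same way GrokModel does: explicit constructor argument first, then the corresponding settings value, then a default (or the bundled model metadata via require_costs). A minimal sketch of that precedence, independent of deepeval:

# Sketch of the resolution order the new constructors implement.
from typing import Optional

def resolve(explicit: Optional[float], from_settings: Optional[float],
            fallback: Optional[float]) -> Optional[float]:
    if explicit is not None:
        return float(explicit)  # constructor argument wins
    if from_settings is not None:
        return from_settings    # then the settings / env value
    return fallback             # then the default or bundled metadata

assert resolve(0.7, 0.2, 0.0) == 0.7
assert resolve(None, 0.2, 0.0) == 0.2
assert resolve(None, None, 0.0) == 0.0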
@@ -117,13 +104,19 @@ class KimiModel(DeepEvalBaseLLM):
     @retry_kimi
     def generate(
         self, prompt: str, schema: Optional[BaseModel] = None
-    ) -> Tuple[Union[str, Dict], float]:
+    ) -> Tuple[Union[str, BaseModel], float]:
+
+        if check_if_multimodal(prompt):
+            prompt = convert_to_multi_modal_array(input=prompt)
+            content = self.generate_content(prompt)
+        else:
+            content = [{"type": "text", "text": prompt}]
 
         client = self.load_model(async_mode=False)
-        if schema and self.name in json_mode_models:
+        if schema and self.supports_json_mode() is True:
             completion = client.chat.completions.create(
                 model=self.name,
-                messages=[{"role": "user", "content": prompt}],
+                messages=[{"role": "user", "content": content}],
                 response_format={"type": "json_object"},
                 temperature=self.temperature,
                 **self.generation_kwargs,
@@ -139,7 +132,7 @@ class KimiModel(DeepEvalBaseLLM):
 
         completion = client.chat.completions.create(
             model=self.name,
-            messages=[{"role": "user", "content": prompt}],
+            messages=[{"role": "user", "content": content}],
             **self.generation_kwargs,
         )
         output = completion.choices[0].message.content
@@ -156,13 +149,19 @@ class KimiModel(DeepEvalBaseLLM):
     @retry_kimi
     async def a_generate(
         self, prompt: str, schema: Optional[BaseModel] = None
-    ) -> Tuple[Union[str, Dict], float]:
+    ) -> Tuple[Union[str, BaseModel], float]:
+
+        if check_if_multimodal(prompt):
+            prompt = convert_to_multi_modal_array(input=prompt)
+            content = self.generate_content(prompt)
+        else:
+            content = [{"type": "text", "text": prompt}]
 
         client = self.load_model(async_mode=True)
-        if schema and self.name in json_mode_models:
+        if schema and self.supports_json_mode() is True:
             completion = await client.chat.completions.create(
                 model=self.name,
-                messages=[{"role": "user", "content": prompt}],
+                messages=[{"role": "user", "content": content}],
                 response_format={"type": "json_object"},
                 temperature=self.temperature,
                 **self.generation_kwargs,
@@ -178,7 +177,7 @@ class KimiModel(DeepEvalBaseLLM):
 
         completion = await client.chat.completions.create(
             model=self.name,
-            messages=[{"role": "user", "content": prompt}],
+            messages=[{"role": "user", "content": content}],
             **self.generation_kwargs,
         )
         output = completion.choices[0].message.content
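
Note: unlike GrokModel, which gates on native structured outputs, KimiModel gates on JSON mode: when supports_json_mode() is True the request adds response_format={"type": "json_object"}, otherwise it falls back to a plain completion. A hedged usage sketch; the model name is illustrative, and whether the returned output is a parsed object or a raw string depends on trim_and_load_json handling not shown in this diff:

# Hedged sketch: requesting schema-shaped output from KimiModel.
# Requires a configured MOONSHOT_API_KEY at runtime.
from pydantic import BaseModel
from deepeval.models.llms.kimi_model import KimiModel

class Answer(BaseModel):
    answer: str

model = KimiModel(model="kimi-latest-8k")  # illustrative model name
output, cost = model.generate('Reply in JSON: {"answer": "..."}', schema=Answer)
print(output, cost)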
@@ -192,6 +191,34 @@ class KimiModel(DeepEvalBaseLLM):
         else:
             return output, cost
 
+    def generate_content(
+        self, multimodal_input: List[Union[str, MLLMImage]] = []
+    ):
+        content = []
+        for element in multimodal_input:
+            if isinstance(element, str):
+                content.append({"type": "text", "text": element})
+            elif isinstance(element, MLLMImage):
+                if element.url and not element.local:
+                    content.append(
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": element.url},
+                        }
+                    )
+                else:
+                    element.ensure_images_loaded()
+                    data_uri = (
+                        f"data:{element.mimeType};base64,{element.dataBase64}"
+                    )
+                    content.append(
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": data_uri},
+                        }
+                    )
+        return content
+
     ###############################################
     # Utilities
     ###############################################
@@ -201,11 +228,29 @@ class KimiModel(DeepEvalBaseLLM):
         input_tokens: int,
         output_tokens: int,
     ) -> float:
-        pricing = model_pricing.get(self.name, model_pricing)
-        input_cost = input_tokens * pricing["input"]
-        output_cost = output_tokens * pricing["output"]
+        input_cost = input_tokens * self.model_data.input_price
+        output_cost = output_tokens * self.model_data.output_price
         return input_cost + output_cost
 
+    ###############################################
+    # Capabilities
+    ###############################################
+
+    def supports_log_probs(self) -> Union[bool, None]:
+        return self.model_data.supports_log_probs
+
+    def supports_temperature(self) -> Union[bool, None]:
+        return self.model_data.supports_temperature
+
+    def supports_multimodal(self) -> Union[bool, None]:
+        return self.model_data.supports_multimodal
+
+    def supports_structured_outputs(self) -> Union[bool, None]:
+        return self.model_data.supports_structured_outputs
+
+    def supports_json_mode(self) -> Union[bool, None]:
+        return self.model_data.supports_json
+
     ###############################################
     # Model
     ###############################################
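
Note: as in GrokModel, calculate_cost is now a straight linear combination of the per-token prices stored on model_data. A worked example with illustrative prices:

# Worked example of the calculate_cost arithmetic with illustrative
# prices (1.00 USD per 1M input tokens, 3.00 USD per 1M output tokens).
input_price = 1.00 / 1e6
output_price = 3.00 / 1e6
input_tokens, output_tokens = 12_000, 800

cost = input_tokens * input_price + output_tokens * output_price
print(f"${cost:.6f}")  # $0.014400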