deepeval 3.7.4__py3-none-any.whl → 3.7.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (224)
  1. deepeval/_version.py +1 -1
  2. deepeval/config/settings.py +35 -1
  3. deepeval/dataset/api.py +23 -1
  4. deepeval/dataset/golden.py +139 -2
  5. deepeval/evaluate/evaluate.py +16 -11
  6. deepeval/evaluate/execute.py +13 -181
  7. deepeval/evaluate/utils.py +6 -26
  8. deepeval/integrations/pydantic_ai/agent.py +19 -2
  9. deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
  10. deepeval/key_handler.py +3 -0
  11. deepeval/metrics/__init__.py +14 -16
  12. deepeval/metrics/answer_relevancy/answer_relevancy.py +118 -116
  13. deepeval/metrics/answer_relevancy/template.py +22 -3
  14. deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
  15. deepeval/metrics/arena_g_eval/template.py +17 -1
  16. deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
  17. deepeval/metrics/argument_correctness/template.py +19 -2
  18. deepeval/metrics/base_metric.py +13 -44
  19. deepeval/metrics/bias/bias.py +102 -108
  20. deepeval/metrics/bias/template.py +14 -2
  21. deepeval/metrics/contextual_precision/contextual_precision.py +96 -94
  22. deepeval/metrics/contextual_precision/template.py +115 -66
  23. deepeval/metrics/contextual_recall/contextual_recall.py +94 -84
  24. deepeval/metrics/contextual_recall/template.py +106 -55
  25. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +86 -84
  26. deepeval/metrics/contextual_relevancy/template.py +87 -58
  27. deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
  28. deepeval/metrics/conversation_completeness/template.py +23 -3
  29. deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
  30. deepeval/metrics/conversational_dag/nodes.py +66 -123
  31. deepeval/metrics/conversational_dag/templates.py +16 -0
  32. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
  33. deepeval/metrics/dag/dag.py +10 -0
  34. deepeval/metrics/dag/nodes.py +63 -126
  35. deepeval/metrics/dag/templates.py +16 -2
  36. deepeval/metrics/exact_match/exact_match.py +9 -1
  37. deepeval/metrics/faithfulness/faithfulness.py +138 -149
  38. deepeval/metrics/faithfulness/schema.py +1 -1
  39. deepeval/metrics/faithfulness/template.py +200 -115
  40. deepeval/metrics/g_eval/g_eval.py +87 -78
  41. deepeval/metrics/g_eval/template.py +18 -1
  42. deepeval/metrics/g_eval/utils.py +7 -6
  43. deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
  44. deepeval/metrics/goal_accuracy/template.py +21 -3
  45. deepeval/metrics/hallucination/hallucination.py +60 -75
  46. deepeval/metrics/hallucination/template.py +13 -0
  47. deepeval/metrics/indicator.py +7 -10
  48. deepeval/metrics/json_correctness/json_correctness.py +40 -38
  49. deepeval/metrics/json_correctness/template.py +10 -0
  50. deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
  51. deepeval/metrics/knowledge_retention/schema.py +9 -3
  52. deepeval/metrics/knowledge_retention/template.py +12 -0
  53. deepeval/metrics/mcp/mcp_task_completion.py +68 -38
  54. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
  55. deepeval/metrics/mcp/template.py +52 -0
  56. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
  57. deepeval/metrics/mcp_use_metric/template.py +12 -0
  58. deepeval/metrics/misuse/misuse.py +77 -97
  59. deepeval/metrics/misuse/template.py +15 -0
  60. deepeval/metrics/multimodal_metrics/__init__.py +0 -19
  61. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +59 -53
  62. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +79 -95
  63. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +59 -53
  64. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +59 -53
  65. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +111 -109
  66. deepeval/metrics/non_advice/non_advice.py +79 -105
  67. deepeval/metrics/non_advice/template.py +12 -0
  68. deepeval/metrics/pattern_match/pattern_match.py +12 -4
  69. deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
  70. deepeval/metrics/pii_leakage/template.py +14 -0
  71. deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
  72. deepeval/metrics/plan_adherence/template.py +11 -0
  73. deepeval/metrics/plan_quality/plan_quality.py +63 -87
  74. deepeval/metrics/plan_quality/template.py +9 -0
  75. deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
  76. deepeval/metrics/prompt_alignment/template.py +12 -0
  77. deepeval/metrics/ragas.py +3 -3
  78. deepeval/metrics/role_adherence/role_adherence.py +48 -71
  79. deepeval/metrics/role_adherence/template.py +14 -0
  80. deepeval/metrics/role_violation/role_violation.py +75 -108
  81. deepeval/metrics/role_violation/template.py +12 -0
  82. deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
  83. deepeval/metrics/step_efficiency/template.py +11 -0
  84. deepeval/metrics/summarization/summarization.py +115 -183
  85. deepeval/metrics/summarization/template.py +19 -0
  86. deepeval/metrics/task_completion/task_completion.py +67 -73
  87. deepeval/metrics/tool_correctness/tool_correctness.py +45 -44
  88. deepeval/metrics/tool_use/tool_use.py +42 -66
  89. deepeval/metrics/topic_adherence/template.py +13 -0
  90. deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
  91. deepeval/metrics/toxicity/template.py +13 -0
  92. deepeval/metrics/toxicity/toxicity.py +80 -99
  93. deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  94. deepeval/metrics/turn_contextual_precision/template.py +187 -0
  95. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +592 -0
  96. deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  97. deepeval/metrics/turn_contextual_recall/template.py +178 -0
  98. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +563 -0
  99. deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
  100. deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
  101. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +576 -0
  102. deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
  103. deepeval/metrics/turn_faithfulness/template.py +218 -0
  104. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +627 -0
  105. deepeval/metrics/turn_relevancy/template.py +14 -0
  106. deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
  107. deepeval/metrics/utils.py +158 -122
  108. deepeval/models/__init__.py +0 -12
  109. deepeval/models/base_model.py +49 -33
  110. deepeval/models/embedding_models/__init__.py +7 -0
  111. deepeval/models/embedding_models/azure_embedding_model.py +79 -33
  112. deepeval/models/embedding_models/local_embedding_model.py +39 -20
  113. deepeval/models/embedding_models/ollama_embedding_model.py +52 -19
  114. deepeval/models/embedding_models/openai_embedding_model.py +42 -22
  115. deepeval/models/llms/amazon_bedrock_model.py +226 -72
  116. deepeval/models/llms/anthropic_model.py +178 -63
  117. deepeval/models/llms/azure_model.py +218 -60
  118. deepeval/models/llms/constants.py +2032 -0
  119. deepeval/models/llms/deepseek_model.py +95 -40
  120. deepeval/models/llms/gemini_model.py +209 -64
  121. deepeval/models/llms/grok_model.py +139 -68
  122. deepeval/models/llms/kimi_model.py +140 -90
  123. deepeval/models/llms/litellm_model.py +131 -37
  124. deepeval/models/llms/local_model.py +125 -21
  125. deepeval/models/llms/ollama_model.py +147 -24
  126. deepeval/models/llms/openai_model.py +222 -269
  127. deepeval/models/llms/portkey_model.py +81 -22
  128. deepeval/models/llms/utils.py +8 -3
  129. deepeval/models/retry_policy.py +17 -14
  130. deepeval/models/utils.py +106 -5
  131. deepeval/optimizer/__init__.py +5 -0
  132. deepeval/optimizer/algorithms/__init__.py +6 -0
  133. deepeval/optimizer/algorithms/base.py +29 -0
  134. deepeval/optimizer/algorithms/configs.py +18 -0
  135. deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  136. deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
  137. deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  138. deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
  139. deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  140. deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  141. deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  142. deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  143. deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  144. deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
  145. deepeval/{optimization → optimizer}/configs.py +5 -8
  146. deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
  147. deepeval/optimizer/prompt_optimizer.py +263 -0
  148. deepeval/optimizer/rewriter/__init__.py +5 -0
  149. deepeval/optimizer/rewriter/rewriter.py +124 -0
  150. deepeval/optimizer/rewriter/utils.py +214 -0
  151. deepeval/optimizer/scorer/__init__.py +5 -0
  152. deepeval/optimizer/scorer/base.py +86 -0
  153. deepeval/optimizer/scorer/scorer.py +316 -0
  154. deepeval/optimizer/scorer/utils.py +30 -0
  155. deepeval/optimizer/types.py +148 -0
  156. deepeval/{optimization → optimizer}/utils.py +47 -165
  157. deepeval/prompt/prompt.py +5 -9
  158. deepeval/simulator/conversation_simulator.py +43 -0
  159. deepeval/simulator/template.py +13 -0
  160. deepeval/test_case/__init__.py +1 -3
  161. deepeval/test_case/api.py +26 -45
  162. deepeval/test_case/arena_test_case.py +7 -2
  163. deepeval/test_case/conversational_test_case.py +68 -1
  164. deepeval/test_case/llm_test_case.py +206 -1
  165. deepeval/test_case/utils.py +4 -8
  166. deepeval/test_run/api.py +18 -14
  167. deepeval/test_run/test_run.py +3 -3
  168. deepeval/tracing/patchers.py +9 -4
  169. deepeval/tracing/tracing.py +2 -2
  170. deepeval/utils.py +65 -0
  171. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -4
  172. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/RECORD +180 -193
  173. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
  174. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
  175. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
  176. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
  177. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
  178. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
  179. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
  180. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
  181. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
  182. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
  183. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
  184. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  185. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
  186. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
  187. deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
  188. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
  189. deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
  190. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -148
  191. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
  192. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  193. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
  194. deepeval/models/mlllms/__init__.py +0 -4
  195. deepeval/models/mlllms/azure_model.py +0 -343
  196. deepeval/models/mlllms/gemini_model.py +0 -313
  197. deepeval/models/mlllms/ollama_model.py +0 -175
  198. deepeval/models/mlllms/openai_model.py +0 -309
  199. deepeval/optimization/__init__.py +0 -13
  200. deepeval/optimization/adapters/__init__.py +0 -2
  201. deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
  202. deepeval/optimization/aggregates.py +0 -14
  203. deepeval/optimization/copro/configs.py +0 -31
  204. deepeval/optimization/gepa/__init__.py +0 -7
  205. deepeval/optimization/gepa/configs.py +0 -115
  206. deepeval/optimization/miprov2/configs.py +0 -134
  207. deepeval/optimization/miprov2/loop.py +0 -785
  208. deepeval/optimization/mutations/__init__.py +0 -0
  209. deepeval/optimization/mutations/prompt_rewriter.py +0 -458
  210. deepeval/optimization/policies/__init__.py +0 -16
  211. deepeval/optimization/policies/tie_breaker.py +0 -67
  212. deepeval/optimization/prompt_optimizer.py +0 -462
  213. deepeval/optimization/simba/__init__.py +0 -0
  214. deepeval/optimization/simba/configs.py +0 -33
  215. deepeval/optimization/types.py +0 -361
  216. deepeval/test_case/mllm_test_case.py +0 -170
  217. /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
  218. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
  219. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
  220. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
  221. /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
  222. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
  223. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
  224. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
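
Among the renames above, the deepeval.optimization package becomes deepeval.optimizer, with each algorithm's loop.py moving to optimizer/algorithms/<name>/<name>.py. Downstream imports will need updating; a hypothetical before/after sketch, assuming the PromptOptimizer class keeps its name across the move (only file paths, not class names, are visible in this listing):

    # deepeval 3.7.4
    from deepeval.optimization.prompt_optimizer import PromptOptimizer

    # deepeval 3.7.6
    from deepeval.optimizer.prompt_optimizer import PromptOptimizer

The two largest per-file rewrites, grok_model.py and kimi_model.py, are reproduced below.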
--- deepeval/models/llms/grok_model.py (deepeval 3.7.4)
+++ deepeval/models/llms/grok_model.py (deepeval 3.7.6)
@@ -1,91 +1,101 @@
-from typing import Optional, Tuple, Union, Dict
+from typing import Optional, Tuple, Union, Dict, List
 from pydantic import BaseModel, SecretStr
 
+from deepeval.errors import DeepEvalError
 from deepeval.config.settings import get_settings
 from deepeval.models.retry_policy import (
     create_retry_decorator,
     sdk_retries_for,
 )
 from deepeval.models.llms.utils import trim_and_load_json
-from deepeval.models.utils import require_secret_api_key
+from deepeval.models.utils import (
+    require_costs,
+    require_secret_api_key,
+)
+from deepeval.test_case import MLLMImage
+from deepeval.utils import check_if_multimodal, convert_to_multi_modal_array
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.constants import ProviderSlug as PS
-
+from deepeval.models.llms.constants import GROK_MODELS_DATA
+from deepeval.utils import require_param
 
 # consistent retry rules
 retry_grok = create_retry_decorator(PS.GROK)
 
 
-structured_outputs_models = [
-    "grok-4-0709",
-    "grok-3",
-    "grok-3-mini",
-    "grok-3-fast",
-    "grok-3-mini-fast",
-]
-
-model_pricing = {
-    "grok-4-0709": {
-        "input": 0.20 / 1e6,
-        "output": 2.00 / 1e6,
-    },
-    "grok-3": {
-        "input": 1.00 / 1e6,
-        "output": 3.00 / 1e6,
-    },
-    "grok-3-mini": {
-        "input": 2.00 / 1e6,
-        "output": 5.00 / 1e6,
-    },
-    "grok-3-fast": {
-        "input": 0.60 / 1e6,
-        "output": 2.50 / 1e6,
-    },
-    "grok-3-mini-fast": {
-        "input": 30 / 1e6,
-        "output": 30 / 1e6,
-    },
-    "grok-2-vision-1212": {
-        "input": 1.00 / 1e6,
-        "output": 2.00 / 1e6,
-    },
-}
-
-
 class GrokModel(DeepEvalBaseLLM):
     def __init__(
         self,
         model: Optional[str] = None,
         api_key: Optional[str] = None,
-        temperature: float = 0,
+        temperature: Optional[float] = None,
+        cost_per_input_token: Optional[float] = None,
+        cost_per_output_token: Optional[float] = None,
         generation_kwargs: Optional[Dict] = None,
         **kwargs,
     ):
+
         settings = get_settings()
 
-        model_name = model or settings.GROK_MODEL_NAME
+        model = model or settings.GROK_MODEL_NAME
 
-        if model_name not in model_pricing:
-            raise ValueError(
-                f"Invalid model. Available Grok models: {', '.join(model_pricing.keys())}"
-            )
-        temperature_from_key = settings.TEMPERATURE
-        if temperature_from_key is None:
-            self.temperature = temperature
+        if temperature is not None:
+            temperature = float(temperature)
+        elif settings.TEMPERATURE is not None:
+            temperature = settings.TEMPERATURE
         else:
-            self.temperature = float(temperature_from_key)
-        if self.temperature < 0:
-            raise ValueError("Temperature must be >= 0.")
+            temperature = 0.0
+
+        cost_per_input_token = (
+            cost_per_input_token
+            if cost_per_input_token is not None
+            else settings.GROK_COST_PER_INPUT_TOKEN
+        )
+        cost_per_output_token = (
+            cost_per_output_token
+            if cost_per_output_token is not None
+            else settings.GROK_COST_PER_OUTPUT_TOKEN
+        )
 
         if api_key is not None:
             # keep it secret, keep it safe from serializings, logging and alike
-            self.api_key: SecretStr | None = SecretStr(api_key)
+            self.api_key: Optional[SecretStr] = SecretStr(api_key)
         else:
             self.api_key = settings.GROK_API_KEY
 
+        model = require_param(
+            model,
+            provider_label="GrokModel",
+            env_var_name="GROK_MODEL_NAME",
+            param_hint="model",
+        )
+
+        # validation
+        if temperature < 0:
+            raise DeepEvalError("Temperature must be >= 0.")
+
+        self.model_data = GROK_MODELS_DATA.get(model)
+        self.temperature = temperature
+
+        cost_per_input_token, cost_per_output_token = require_costs(
+            self.model_data,
+            model,
+            "GROK_COST_PER_INPUT_TOKEN",
+            "GROK_COST_PER_OUTPUT_TOKEN",
+            cost_per_input_token,
+            cost_per_output_token,
+        )
+        self.model_data.input_price = cost_per_input_token
+        self.model_data.output_price = cost_per_output_token
+
+        # Keep sanitized kwargs for client call to strip legacy keys
         self.kwargs = kwargs
-        self.generation_kwargs = generation_kwargs or {}
-        super().__init__(model_name)
+        self.kwargs.pop("temperature", None)
+
+        self.generation_kwargs = dict(generation_kwargs or {})
+        self.generation_kwargs.pop("temperature", None)
+
+        super().__init__(model)
 
     ###############################################
     # Other generate functions
@@ -94,22 +104,29 @@ class GrokModel(DeepEvalBaseLLM):
     @retry_grok
     def generate(
         self, prompt: str, schema: Optional[BaseModel] = None
-    ) -> Tuple[Union[str, Dict], float]:
+    ) -> Tuple[Union[str, BaseModel], float]:
+
        try:
            from xai_sdk.chat import user
        except ImportError:
            raise ImportError(
                "xai_sdk is required to use GrokModel. Please install it with: pip install xai-sdk"
            )
+        if check_if_multimodal(prompt):
+            prompt = convert_to_multi_modal_array(input=prompt)
+            content = self.generate_content(prompt)
+        else:
+            content = [{"type": "text", "text": prompt}]
+
        client = self.load_model(async_mode=False)
        chat = client.chat.create(
-            model=self.model_name,
+            model=self.name,
            temperature=self.temperature,
            **self.generation_kwargs,
        )
-        chat.append(user(prompt))
+        chat.append(user(content))
 
-        if schema and self.model_name in structured_outputs_models:
+        if schema and self.supports_structured_outputs() is True:
            response, structured_output = chat.parse(schema)
            cost = self.calculate_cost(
                response.usage.prompt_tokens,
@@ -132,22 +149,30 @@ class GrokModel(DeepEvalBaseLLM):
    @retry_grok
    async def a_generate(
        self, prompt: str, schema: Optional[BaseModel] = None
-    ) -> Tuple[Union[str, Dict], float]:
+    ) -> Tuple[Union[str, BaseModel], float]:
+
        try:
            from xai_sdk.chat import user
        except ImportError:
            raise ImportError(
                "xai_sdk is required to use GrokModel. Please install it with: pip install xai-sdk"
            )
+
+        if check_if_multimodal(prompt):
+            prompt = convert_to_multi_modal_array(input=prompt)
+            content = self.generate_content(prompt)
+        else:
+            content = [{"type": "text", "text": prompt}]
+
        client = self.load_model(async_mode=True)
        chat = client.chat.create(
-            model=self.model_name,
+            model=self.name,
            temperature=self.temperature,
            **self.generation_kwargs,
        )
-        chat.append(user(prompt))
+        chat.append(user(content))
 
-        if schema and self.model_name in structured_outputs_models:
+        if schema and self.supports_structured_outputs() is True:
            response, structured_output = await chat.parse(schema)
            cost = self.calculate_cost(
                response.usage.prompt_tokens,
@@ -167,6 +192,34 @@ class GrokModel(DeepEvalBaseLLM):
        else:
            return output, cost
 
+    def generate_content(
+        self, multimodal_input: List[Union[str, MLLMImage]] = []
+    ):
+        content = []
+        for element in multimodal_input:
+            if isinstance(element, str):
+                content.append({"type": "text", "text": element})
+            elif isinstance(element, MLLMImage):
+                if element.url and not element.local:
+                    content.append(
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": element.url},
+                        }
+                    )
+                else:
+                    element.ensure_images_loaded()
+                    data_uri = (
+                        f"data:{element.mimeType};base64,{element.dataBase64}"
+                    )
+                    content.append(
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": data_uri},
+                        }
+                    )
+        return content
+
    ###############################################
    # Utilities
    ###############################################
@@ -176,11 +229,29 @@ class GrokModel(DeepEvalBaseLLM):
        input_tokens: int,
        output_tokens: int,
    ) -> float:
-        pricing = model_pricing.get(self.model_name, model_pricing)
-        input_cost = input_tokens * pricing["input"]
-        output_cost = output_tokens * pricing["output"]
+        input_cost = input_tokens * self.model_data.input_price
+        output_cost = output_tokens * self.model_data.output_price
        return input_cost + output_cost
 
+    ###############################################
+    # Capabilities
+    ###############################################
+
+    def supports_log_probs(self) -> Union[bool, None]:
+        return self.model_data.supports_log_probs
+
+    def supports_temperature(self) -> Union[bool, None]:
+        return self.model_data.supports_temperature
+
+    def supports_multimodal(self) -> Union[bool, None]:
+        return self.model_data.supports_multimodal
+
+    def supports_structured_outputs(self) -> Union[bool, None]:
+        return self.model_data.supports_structured_outputs
+
+    def supports_json_mode(self) -> Union[bool, None]:
+        return self.model_data.supports_json
+
    ###############################################
    # Model
    ###############################################
@@ -198,9 +269,6 @@ class GrokModel(DeepEvalBaseLLM):
                "xai_sdk is required to use GrokModel. Please install it with: pip install xai-sdk"
            )
 
-    def get_model_name(self):
-        return f"{self.model_name}"
-
    def _client_kwargs(self) -> Dict:
        """
        If Tenacity is managing retries, disable gRPC channel retries to avoid double retry.
@@ -242,3 +310,6 @@ class GrokModel(DeepEvalBaseLLM):
                kw.pop("channel_options", None)
                return cls(**kw)
            raise
+
+    def get_model_name(self):
+        return f"{self.name} (Grok)"
--- deepeval/models/llms/kimi_model.py (deepeval 3.7.4)
+++ deepeval/models/llms/kimi_model.py (deepeval 3.7.6)
@@ -1,111 +1,101 @@
-from typing import Optional, Tuple, Union, Dict
+from typing import Optional, Tuple, Union, Dict, List
 from openai import OpenAI, AsyncOpenAI
 from pydantic import BaseModel, SecretStr
 
+from deepeval.errors import DeepEvalError
 from deepeval.config.settings import get_settings
 from deepeval.models.retry_policy import (
     create_retry_decorator,
     sdk_retries_for,
 )
 from deepeval.models.llms.utils import trim_and_load_json
-from deepeval.models.utils import require_secret_api_key
+from deepeval.models.utils import (
+    require_costs,
+    require_secret_api_key,
+)
+from deepeval.test_case import MLLMImage
+from deepeval.utils import check_if_multimodal, convert_to_multi_modal_array
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.constants import ProviderSlug as PS
-
+from deepeval.models.llms.constants import KIMI_MODELS_DATA
+from deepeval.utils import require_param
 
 retry_kimi = create_retry_decorator(PS.KIMI)
 
-json_mode_models = [
-    "kimi-thinking-preview",
-    "kimi-k2-0711-preview",
-    "kimi-latest-128k",
-    "kimi-latest-32k",
-    "kimi-latest-8k",
-]
-
-model_pricing = {
-    "kimi-latest-8k": {
-        "input": 0.20 / 1e6,
-        "output": 2.00 / 1e6,
-    },
-    "kimi-latest-32k": {
-        "input": 1.00 / 1e6,
-        "output": 3.00 / 1e6,
-    },
-    "kimi-latest-128k": {
-        "input": 2.00 / 1e6,
-        "output": 5.00 / 1e6,
-    },
-    "kimi-k2-0711-preview": {
-        "input": 0.60 / 1e6,
-        "output": 2.50 / 1e6,
-    },
-    "kimi-thinking-preview": {
-        "input": 30 / 1e6,
-        "output": 30 / 1e6,
-    },
-    "moonshot-v1-8k": {
-        "input": 1.00 / 1e6,
-        "output": 2.00 / 1e6,
-    },
-    "moonshot-v1-32k": {
-        "input": 2.00 / 1e6,
-        "output": 3.00 / 1e6,
-    },
-    "moonshot-v1-128k": {
-        "input": 0.20 / 1e6,
-        "output": 5.00 / 1e6,
-    },
-    "moonshot-v1-8k-vision-preview": {
-        "input": 1.00 / 1e6,
-        "output": 2.00 / 1e6,
-    },
-    "moonshot-v1-32k-vision-preview": {
-        "input": 2.00 / 1e6,
-        "output": 3.00 / 1e6,
-    },
-    "moonshot-v1-128k-vision-preview": {
-        "input": 0.20 / 1e6,
-        "output": 5.00 / 1e6,
-    },
-}
-
 
 class KimiModel(DeepEvalBaseLLM):
     def __init__(
         self,
-        api_key: Optional[str] = None,
         model: Optional[str] = None,
-        temperature: float = 0,
+        api_key: Optional[str] = None,
+        temperature: Optional[float] = None,
+        cost_per_input_token: Optional[float] = None,
+        cost_per_output_token: Optional[float] = None,
         generation_kwargs: Optional[Dict] = None,
         **kwargs,
     ):
         settings = get_settings()
 
-        model_name = model or settings.MOONSHOT_MODEL_NAME
-        if model_name not in model_pricing:
-            raise ValueError(
-                f"Invalid model. Available Moonshot models: {', '.join(model_pricing.keys())}"
-            )
+        model = model or settings.MOONSHOT_MODEL_NAME
 
-        temperature_from_key = settings.TEMPERATURE
-        if temperature_from_key is None:
-            self.temperature = temperature
+        if temperature is not None:
+            temperature = float(temperature)
+        elif settings.TEMPERATURE is not None:
+            temperature = settings.TEMPERATURE
         else:
-            self.temperature = float(temperature_from_key)
-        if self.temperature < 0:
-            raise ValueError("Temperature must be >= 0.")
+            temperature = 0.0
+
+        cost_per_input_token = (
+            cost_per_input_token
+            if cost_per_input_token is not None
+            else settings.MOONSHOT_COST_PER_INPUT_TOKEN
+        )
+        cost_per_output_token = (
+            cost_per_output_token
+            if cost_per_output_token is not None
+            else settings.MOONSHOT_COST_PER_OUTPUT_TOKEN
+        )
 
         if api_key is not None:
             # keep it secret, keep it safe from serializings, logging and alike
-            self.api_key: SecretStr | None = SecretStr(api_key)
+            self.api_key: Optional[SecretStr] = SecretStr(api_key)
         else:
             self.api_key = settings.MOONSHOT_API_KEY
 
+        # validation
+        model = require_param(
+            model,
+            provider_label="KimiModel",
+            env_var_name="MOONSHOT_MODEL_NAME",
+            param_hint="model",
+        )
+
+        if temperature < 0:
+            raise DeepEvalError("Temperature must be >= 0.")
+
+        self.model_data = KIMI_MODELS_DATA.get(model)
+        self.temperature = temperature
+
+        cost_per_input_token, cost_per_output_token = require_costs(
+            self.model_data,
+            model,
+            "MOONSHOT_COST_PER_INPUT_TOKEN",
+            "MOONSHOT_COST_PER_OUTPUT_TOKEN",
+            cost_per_input_token,
+            cost_per_output_token,
+        )
+        self.model_data.input_price = float(cost_per_input_token)
+        self.model_data.output_price = float(cost_per_output_token)
+
         self.base_url = "https://api.moonshot.cn/v1"
+        # Keep sanitized kwargs for client call to strip legacy keys
         self.kwargs = kwargs
-        self.generation_kwargs = generation_kwargs or {}
-        super().__init__(model_name)
+        self.kwargs.pop("temperature", None)
+
+        self.generation_kwargs = dict(generation_kwargs or {})
+        self.generation_kwargs.pop("temperature", None)
+
+        super().__init__(model)
 
     ###############################################
     # Other generate functions
@@ -114,12 +104,19 @@ class KimiModel(DeepEvalBaseLLM):
     @retry_kimi
     def generate(
         self, prompt: str, schema: Optional[BaseModel] = None
-    ) -> Tuple[Union[str, Dict], float]:
+    ) -> Tuple[Union[str, BaseModel], float]:
+
+        if check_if_multimodal(prompt):
+            prompt = convert_to_multi_modal_array(input=prompt)
+            content = self.generate_content(prompt)
+        else:
+            content = [{"type": "text", "text": prompt}]
+
         client = self.load_model(async_mode=False)
-        if schema and self.model_name in json_mode_models:
+        if schema and self.supports_json_mode() is True:
             completion = client.chat.completions.create(
-                model=self.model_name,
-                messages=[{"role": "user", "content": prompt}],
+                model=self.name,
+                messages=[{"role": "user", "content": content}],
                 response_format={"type": "json_object"},
                 temperature=self.temperature,
                 **self.generation_kwargs,
@@ -134,8 +131,8 @@ class KimiModel(DeepEvalBaseLLM):
             return schema.model_validate(json_output), cost
 
         completion = client.chat.completions.create(
-            model=self.model_name,
-            messages=[{"role": "user", "content": prompt}],
+            model=self.name,
+            messages=[{"role": "user", "content": content}],
             **self.generation_kwargs,
         )
         output = completion.choices[0].message.content
@@ -152,12 +149,19 @@ class KimiModel(DeepEvalBaseLLM):
     @retry_kimi
     async def a_generate(
         self, prompt: str, schema: Optional[BaseModel] = None
-    ) -> Tuple[Union[str, Dict], float]:
+    ) -> Tuple[Union[str, BaseModel], float]:
+
+        if check_if_multimodal(prompt):
+            prompt = convert_to_multi_modal_array(input=prompt)
+            content = self.generate_content(prompt)
+        else:
+            content = [{"type": "text", "text": prompt}]
+
         client = self.load_model(async_mode=True)
-        if schema and self.model_name in json_mode_models:
+        if schema and self.supports_json_mode() is True:
             completion = await client.chat.completions.create(
-                model=self.model_name,
-                messages=[{"role": "user", "content": prompt}],
+                model=self.name,
+                messages=[{"role": "user", "content": content}],
                 response_format={"type": "json_object"},
                 temperature=self.temperature,
                 **self.generation_kwargs,
@@ -172,8 +176,8 @@ class KimiModel(DeepEvalBaseLLM):
             return schema.model_validate(json_output), cost
 
         completion = await client.chat.completions.create(
-            model=self.model_name,
-            messages=[{"role": "user", "content": prompt}],
+            model=self.name,
+            messages=[{"role": "user", "content": content}],
             **self.generation_kwargs,
         )
         output = completion.choices[0].message.content
@@ -187,6 +191,34 @@ class KimiModel(DeepEvalBaseLLM):
         else:
             return output, cost
 
+    def generate_content(
+        self, multimodal_input: List[Union[str, MLLMImage]] = []
+    ):
+        content = []
+        for element in multimodal_input:
+            if isinstance(element, str):
+                content.append({"type": "text", "text": element})
+            elif isinstance(element, MLLMImage):
+                if element.url and not element.local:
+                    content.append(
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": element.url},
+                        }
+                    )
+                else:
+                    element.ensure_images_loaded()
+                    data_uri = (
+                        f"data:{element.mimeType};base64,{element.dataBase64}"
+                    )
+                    content.append(
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": data_uri},
+                        }
+                    )
+        return content
+
     ###############################################
     # Utilities
     ###############################################
@@ -196,11 +228,29 @@ class KimiModel(DeepEvalBaseLLM):
         input_tokens: int,
         output_tokens: int,
     ) -> float:
-        pricing = model_pricing.get(self.model_name, model_pricing)
-        input_cost = input_tokens * pricing["input"]
-        output_cost = output_tokens * pricing["output"]
+        input_cost = input_tokens * self.model_data.input_price
+        output_cost = output_tokens * self.model_data.output_price
         return input_cost + output_cost
 
+    ###############################################
+    # Capabilities
+    ###############################################
+
+    def supports_log_probs(self) -> Union[bool, None]:
+        return self.model_data.supports_log_probs
+
+    def supports_temperature(self) -> Union[bool, None]:
+        return self.model_data.supports_temperature
+
+    def supports_multimodal(self) -> Union[bool, None]:
+        return self.model_data.supports_multimodal
+
+    def supports_structured_outputs(self) -> Union[bool, None]:
+        return self.model_data.supports_structured_outputs
+
+    def supports_json_mode(self) -> Union[bool, None]:
+        return self.model_data.supports_json
+
     ###############################################
     # Model
     ###############################################
@@ -244,4 +294,4 @@ class KimiModel(DeepEvalBaseLLM):
             raise
 
     def get_model_name(self):
-        return f"{self.model_name}"
+        return f"{self.name} (KIMI)"