deepeval 3.7.4__py3-none-any.whl → 3.7.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/config/settings.py +35 -1
- deepeval/dataset/api.py +23 -1
- deepeval/dataset/golden.py +139 -2
- deepeval/evaluate/evaluate.py +16 -11
- deepeval/evaluate/execute.py +13 -181
- deepeval/evaluate/utils.py +6 -26
- deepeval/integrations/pydantic_ai/agent.py +19 -2
- deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
- deepeval/key_handler.py +3 -0
- deepeval/metrics/__init__.py +14 -16
- deepeval/metrics/answer_relevancy/answer_relevancy.py +118 -116
- deepeval/metrics/answer_relevancy/template.py +22 -3
- deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
- deepeval/metrics/arena_g_eval/template.py +17 -1
- deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
- deepeval/metrics/argument_correctness/template.py +19 -2
- deepeval/metrics/base_metric.py +13 -44
- deepeval/metrics/bias/bias.py +102 -108
- deepeval/metrics/bias/template.py +14 -2
- deepeval/metrics/contextual_precision/contextual_precision.py +96 -94
- deepeval/metrics/contextual_precision/template.py +115 -66
- deepeval/metrics/contextual_recall/contextual_recall.py +94 -84
- deepeval/metrics/contextual_recall/template.py +106 -55
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +86 -84
- deepeval/metrics/contextual_relevancy/template.py +87 -58
- deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
- deepeval/metrics/conversation_completeness/template.py +23 -3
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
- deepeval/metrics/conversational_dag/nodes.py +66 -123
- deepeval/metrics/conversational_dag/templates.py +16 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
- deepeval/metrics/dag/dag.py +10 -0
- deepeval/metrics/dag/nodes.py +63 -126
- deepeval/metrics/dag/templates.py +16 -2
- deepeval/metrics/exact_match/exact_match.py +9 -1
- deepeval/metrics/faithfulness/faithfulness.py +138 -149
- deepeval/metrics/faithfulness/schema.py +1 -1
- deepeval/metrics/faithfulness/template.py +200 -115
- deepeval/metrics/g_eval/g_eval.py +87 -78
- deepeval/metrics/g_eval/template.py +18 -1
- deepeval/metrics/g_eval/utils.py +7 -6
- deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
- deepeval/metrics/goal_accuracy/template.py +21 -3
- deepeval/metrics/hallucination/hallucination.py +60 -75
- deepeval/metrics/hallucination/template.py +13 -0
- deepeval/metrics/indicator.py +7 -10
- deepeval/metrics/json_correctness/json_correctness.py +40 -38
- deepeval/metrics/json_correctness/template.py +10 -0
- deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
- deepeval/metrics/knowledge_retention/schema.py +9 -3
- deepeval/metrics/knowledge_retention/template.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +68 -38
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
- deepeval/metrics/mcp/template.py +52 -0
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
- deepeval/metrics/mcp_use_metric/template.py +12 -0
- deepeval/metrics/misuse/misuse.py +77 -97
- deepeval/metrics/misuse/template.py +15 -0
- deepeval/metrics/multimodal_metrics/__init__.py +0 -19
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +59 -53
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +79 -95
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +59 -53
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +59 -53
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +111 -109
- deepeval/metrics/non_advice/non_advice.py +79 -105
- deepeval/metrics/non_advice/template.py +12 -0
- deepeval/metrics/pattern_match/pattern_match.py +12 -4
- deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
- deepeval/metrics/pii_leakage/template.py +14 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
- deepeval/metrics/plan_adherence/template.py +11 -0
- deepeval/metrics/plan_quality/plan_quality.py +63 -87
- deepeval/metrics/plan_quality/template.py +9 -0
- deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
- deepeval/metrics/prompt_alignment/template.py +12 -0
- deepeval/metrics/ragas.py +3 -3
- deepeval/metrics/role_adherence/role_adherence.py +48 -71
- deepeval/metrics/role_adherence/template.py +14 -0
- deepeval/metrics/role_violation/role_violation.py +75 -108
- deepeval/metrics/role_violation/template.py +12 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
- deepeval/metrics/step_efficiency/template.py +11 -0
- deepeval/metrics/summarization/summarization.py +115 -183
- deepeval/metrics/summarization/template.py +19 -0
- deepeval/metrics/task_completion/task_completion.py +67 -73
- deepeval/metrics/tool_correctness/tool_correctness.py +45 -44
- deepeval/metrics/tool_use/tool_use.py +42 -66
- deepeval/metrics/topic_adherence/template.py +13 -0
- deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
- deepeval/metrics/toxicity/template.py +13 -0
- deepeval/metrics/toxicity/toxicity.py +80 -99
- deepeval/metrics/turn_contextual_precision/schema.py +21 -0
- deepeval/metrics/turn_contextual_precision/template.py +187 -0
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +592 -0
- deepeval/metrics/turn_contextual_recall/schema.py +21 -0
- deepeval/metrics/turn_contextual_recall/template.py +178 -0
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +563 -0
- deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
- deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +576 -0
- deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
- deepeval/metrics/turn_faithfulness/template.py +218 -0
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +627 -0
- deepeval/metrics/turn_relevancy/template.py +14 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
- deepeval/metrics/utils.py +158 -122
- deepeval/models/__init__.py +0 -12
- deepeval/models/base_model.py +49 -33
- deepeval/models/embedding_models/__init__.py +7 -0
- deepeval/models/embedding_models/azure_embedding_model.py +79 -33
- deepeval/models/embedding_models/local_embedding_model.py +39 -20
- deepeval/models/embedding_models/ollama_embedding_model.py +52 -19
- deepeval/models/embedding_models/openai_embedding_model.py +42 -22
- deepeval/models/llms/amazon_bedrock_model.py +226 -72
- deepeval/models/llms/anthropic_model.py +178 -63
- deepeval/models/llms/azure_model.py +218 -60
- deepeval/models/llms/constants.py +2032 -0
- deepeval/models/llms/deepseek_model.py +95 -40
- deepeval/models/llms/gemini_model.py +209 -64
- deepeval/models/llms/grok_model.py +139 -68
- deepeval/models/llms/kimi_model.py +140 -90
- deepeval/models/llms/litellm_model.py +131 -37
- deepeval/models/llms/local_model.py +125 -21
- deepeval/models/llms/ollama_model.py +147 -24
- deepeval/models/llms/openai_model.py +222 -269
- deepeval/models/llms/portkey_model.py +81 -22
- deepeval/models/llms/utils.py +8 -3
- deepeval/models/retry_policy.py +17 -14
- deepeval/models/utils.py +106 -5
- deepeval/optimizer/__init__.py +5 -0
- deepeval/optimizer/algorithms/__init__.py +6 -0
- deepeval/optimizer/algorithms/base.py +29 -0
- deepeval/optimizer/algorithms/configs.py +18 -0
- deepeval/optimizer/algorithms/copro/__init__.py +5 -0
- deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
- deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
- deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
- deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
- deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
- deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
- deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
- deepeval/optimizer/algorithms/simba/__init__.py +5 -0
- deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
- deepeval/{optimization → optimizer}/configs.py +5 -8
- deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
- deepeval/optimizer/prompt_optimizer.py +263 -0
- deepeval/optimizer/rewriter/__init__.py +5 -0
- deepeval/optimizer/rewriter/rewriter.py +124 -0
- deepeval/optimizer/rewriter/utils.py +214 -0
- deepeval/optimizer/scorer/__init__.py +5 -0
- deepeval/optimizer/scorer/base.py +86 -0
- deepeval/optimizer/scorer/scorer.py +316 -0
- deepeval/optimizer/scorer/utils.py +30 -0
- deepeval/optimizer/types.py +148 -0
- deepeval/{optimization → optimizer}/utils.py +47 -165
- deepeval/prompt/prompt.py +5 -9
- deepeval/simulator/conversation_simulator.py +43 -0
- deepeval/simulator/template.py +13 -0
- deepeval/test_case/__init__.py +1 -3
- deepeval/test_case/api.py +26 -45
- deepeval/test_case/arena_test_case.py +7 -2
- deepeval/test_case/conversational_test_case.py +68 -1
- deepeval/test_case/llm_test_case.py +206 -1
- deepeval/test_case/utils.py +4 -8
- deepeval/test_run/api.py +18 -14
- deepeval/test_run/test_run.py +3 -3
- deepeval/tracing/patchers.py +9 -4
- deepeval/tracing/tracing.py +2 -2
- deepeval/utils.py +65 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -4
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/RECORD +180 -193
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -148
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
- deepeval/models/mlllms/__init__.py +0 -4
- deepeval/models/mlllms/azure_model.py +0 -343
- deepeval/models/mlllms/gemini_model.py +0 -313
- deepeval/models/mlllms/ollama_model.py +0 -175
- deepeval/models/mlllms/openai_model.py +0 -309
- deepeval/optimization/__init__.py +0 -13
- deepeval/optimization/adapters/__init__.py +0 -2
- deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
- deepeval/optimization/aggregates.py +0 -14
- deepeval/optimization/copro/configs.py +0 -31
- deepeval/optimization/gepa/__init__.py +0 -7
- deepeval/optimization/gepa/configs.py +0 -115
- deepeval/optimization/miprov2/configs.py +0 -134
- deepeval/optimization/miprov2/loop.py +0 -785
- deepeval/optimization/mutations/__init__.py +0 -0
- deepeval/optimization/mutations/prompt_rewriter.py +0 -458
- deepeval/optimization/policies/__init__.py +0 -16
- deepeval/optimization/policies/tie_breaker.py +0 -67
- deepeval/optimization/prompt_optimizer.py +0 -462
- deepeval/optimization/simba/__init__.py +0 -0
- deepeval/optimization/simba/configs.py +0 -33
- deepeval/optimization/types.py +0 -361
- deepeval/test_case/mllm_test_case.py +0 -170
- /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
- /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
deepeval/models/llms/grok_model.py +139 -68

@@ -1,91 +1,101 @@
-from typing import Optional, Tuple, Union, Dict
+from typing import Optional, Tuple, Union, Dict, List
 from pydantic import BaseModel, SecretStr
 
+from deepeval.errors import DeepEvalError
 from deepeval.config.settings import get_settings
 from deepeval.models.retry_policy import (
     create_retry_decorator,
     sdk_retries_for,
 )
 from deepeval.models.llms.utils import trim_and_load_json
-from deepeval.models.utils import
+from deepeval.models.utils import (
+    require_costs,
+    require_secret_api_key,
+)
+from deepeval.test_case import MLLMImage
+from deepeval.utils import check_if_multimodal, convert_to_multi_modal_array
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.constants import ProviderSlug as PS
-
+from deepeval.models.llms.constants import GROK_MODELS_DATA
+from deepeval.utils import require_param
 
 # consistent retry rules
 retry_grok = create_retry_decorator(PS.GROK)
 
 
-structured_outputs_models = [
-    "grok-4-0709",
-    "grok-3",
-    "grok-3-mini",
-    "grok-3-fast",
-    "grok-3-mini-fast",
-]
-
-model_pricing = {
-    "grok-4-0709": {
-        "input": 0.20 / 1e6,
-        "output": 2.00 / 1e6,
-    },
-    "grok-3": {
-        "input": 1.00 / 1e6,
-        "output": 3.00 / 1e6,
-    },
-    "grok-3-mini": {
-        "input": 2.00 / 1e6,
-        "output": 5.00 / 1e6,
-    },
-    "grok-3-fast": {
-        "input": 0.60 / 1e6,
-        "output": 2.50 / 1e6,
-    },
-    "grok-3-mini-fast": {
-        "input": 30 / 1e6,
-        "output": 30 / 1e6,
-    },
-    "grok-2-vision-1212": {
-        "input": 1.00 / 1e6,
-        "output": 2.00 / 1e6,
-    },
-}
-
-
 class GrokModel(DeepEvalBaseLLM):
     def __init__(
         self,
         model: Optional[str] = None,
         api_key: Optional[str] = None,
-        temperature: float =
+        temperature: Optional[float] = None,
+        cost_per_input_token: Optional[float] = None,
+        cost_per_output_token: Optional[float] = None,
         generation_kwargs: Optional[Dict] = None,
         **kwargs,
     ):
+
         settings = get_settings()
 
-
+        model = model or settings.GROK_MODEL_NAME
 
-        if
-
-
-
-        temperature_from_key = settings.TEMPERATURE
-        if temperature_from_key is None:
-            self.temperature = temperature
+        if temperature is not None:
+            temperature = float(temperature)
+        elif settings.TEMPERATURE is not None:
+            temperature = settings.TEMPERATURE
         else:
-
-
-
+            temperature = 0.0
+
+        cost_per_input_token = (
+            cost_per_input_token
+            if cost_per_input_token is not None
+            else settings.GROK_COST_PER_INPUT_TOKEN
+        )
+        cost_per_output_token = (
+            cost_per_output_token
+            if cost_per_output_token is not None
+            else settings.GROK_COST_PER_OUTPUT_TOKEN
+        )
 
         if api_key is not None:
             # keep it secret, keep it safe from serializings, logging and alike
-            self.api_key: SecretStr
+            self.api_key: Optional[SecretStr] = SecretStr(api_key)
         else:
             self.api_key = settings.GROK_API_KEY
 
+        model = require_param(
+            model,
+            provider_label="GrokModel",
+            env_var_name="GROK_MODEL_NAME",
+            param_hint="model",
+        )
+
+        # validation
+        if temperature < 0:
+            raise DeepEvalError("Temperature must be >= 0.")
+
+        self.model_data = GROK_MODELS_DATA.get(model)
+        self.temperature = temperature
+
+        cost_per_input_token, cost_per_output_token = require_costs(
+            self.model_data,
+            model,
+            "GROK_COST_PER_INPUT_TOKEN",
+            "GROK_COST_PER_OUTPUT_TOKEN",
+            cost_per_input_token,
+            cost_per_output_token,
+        )
+        self.model_data.input_price = cost_per_input_token
+        self.model_data.output_price = cost_per_output_token
+
+        # Keep sanitized kwargs for client call to strip legacy keys
         self.kwargs = kwargs
-        self.
-
+        self.kwargs.pop("temperature", None)
+
+        self.generation_kwargs = dict(generation_kwargs or {})
+        self.generation_kwargs.pop("temperature", None)
+
+        super().__init__(model)
 
     ###############################################
     # Other generate functions

@@ -94,22 +104,29 @@ class GrokModel(DeepEvalBaseLLM):
     @retry_grok
     def generate(
         self, prompt: str, schema: Optional[BaseModel] = None
-    ) -> Tuple[Union[str,
+    ) -> Tuple[Union[str, BaseModel], float]:
+
         try:
             from xai_sdk.chat import user
         except ImportError:
             raise ImportError(
                 "xai_sdk is required to use GrokModel. Please install it with: pip install xai-sdk"
             )
+        if check_if_multimodal(prompt):
+            prompt = convert_to_multi_modal_array(input=prompt)
+            content = self.generate_content(prompt)
+        else:
+            content = [{"type": "text", "text": prompt}]
+
         client = self.load_model(async_mode=False)
         chat = client.chat.create(
-            model=self.
+            model=self.name,
            temperature=self.temperature,
             **self.generation_kwargs,
         )
-        chat.append(user(
+        chat.append(user(content))
 
-        if schema and self.
+        if schema and self.supports_structured_outputs() is True:
             response, structured_output = chat.parse(schema)
             cost = self.calculate_cost(
                 response.usage.prompt_tokens,

@@ -132,22 +149,30 @@ class GrokModel(DeepEvalBaseLLM):
     @retry_grok
     async def a_generate(
         self, prompt: str, schema: Optional[BaseModel] = None
-    ) -> Tuple[Union[str,
+    ) -> Tuple[Union[str, BaseModel], float]:
+
         try:
             from xai_sdk.chat import user
         except ImportError:
             raise ImportError(
                 "xai_sdk is required to use GrokModel. Please install it with: pip install xai-sdk"
             )
+
+        if check_if_multimodal(prompt):
+            prompt = convert_to_multi_modal_array(input=prompt)
+            content = self.generate_content(prompt)
+        else:
+            content = [{"type": "text", "text": prompt}]
+
         client = self.load_model(async_mode=True)
         chat = client.chat.create(
-            model=self.
+            model=self.name,
             temperature=self.temperature,
             **self.generation_kwargs,
         )
-        chat.append(user(
+        chat.append(user(content))
 
-        if schema and self.
+        if schema and self.supports_structured_outputs() is True:
             response, structured_output = await chat.parse(schema)
             cost = self.calculate_cost(
                 response.usage.prompt_tokens,

@@ -167,6 +192,34 @@ class GrokModel(DeepEvalBaseLLM):
         else:
             return output, cost
 
+    def generate_content(
+        self, multimodal_input: List[Union[str, MLLMImage]] = []
+    ):
+        content = []
+        for element in multimodal_input:
+            if isinstance(element, str):
+                content.append({"type": "text", "text": element})
+            elif isinstance(element, MLLMImage):
+                if element.url and not element.local:
+                    content.append(
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": element.url},
+                        }
+                    )
+                else:
+                    element.ensure_images_loaded()
+                    data_uri = (
+                        f"data:{element.mimeType};base64,{element.dataBase64}"
+                    )
+                    content.append(
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": data_uri},
+                        }
+                    )
+        return content
+
     ###############################################
     # Utilities
     ###############################################

@@ -176,11 +229,29 @@ class GrokModel(DeepEvalBaseLLM):
         input_tokens: int,
         output_tokens: int,
     ) -> float:
-
-
-        output_cost = output_tokens * pricing["output"]
+        input_cost = input_tokens * self.model_data.input_price
+        output_cost = output_tokens * self.model_data.output_price
         return input_cost + output_cost
 
+    ###############################################
+    # Capabilities
+    ###############################################
+
+    def supports_log_probs(self) -> Union[bool, None]:
+        return self.model_data.supports_log_probs
+
+    def supports_temperature(self) -> Union[bool, None]:
+        return self.model_data.supports_temperature
+
+    def supports_multimodal(self) -> Union[bool, None]:
+        return self.model_data.supports_multimodal
+
+    def supports_structured_outputs(self) -> Union[bool, None]:
+        return self.model_data.supports_structured_outputs
+
+    def supports_json_mode(self) -> Union[bool, None]:
+        return self.model_data.supports_json
+
     ###############################################
     # Model
     ###############################################

@@ -198,9 +269,6 @@ class GrokModel(DeepEvalBaseLLM):
                 "xai_sdk is required to use GrokModel. Please install it with: pip install xai-sdk"
             )
 
-    def get_model_name(self):
-        return f"{self.model_name}"
-
     def _client_kwargs(self) -> Dict:
         """
         If Tenacity is managing retries, disable gRPC channel retries to avoid double retry.

@@ -242,3 +310,6 @@ class GrokModel(DeepEvalBaseLLM):
                 kw.pop("channel_options", None)
                 return cls(**kw)
             raise
+
+    def get_model_name(self):
+        return f"{self.name} (Grok)"
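Taken together, the GrokModel hunks above drop the module-level structured_outputs_models and model_pricing tables in favor of per-model metadata from GROK_MODELS_DATA, add cost_per_input_token/cost_per_output_token constructor overrides, and gate structured output behind a new capabilities API. The following is a minimal usage sketch inferred from the diff alone; the argument values are illustrative, and details not shown in these hunks may differ in the released wheel.

from pydantic import BaseModel

from deepeval.models.llms.grok_model import GrokModel


class Verdict(BaseModel):
    verdict: str


# Constructor arguments fall back to settings (GROK_MODEL_NAME, TEMPERATURE,
# GROK_COST_PER_INPUT_TOKEN, ...) when omitted, per the __init__ hunk above.
model = GrokModel(
    model="grok-4-0709",
    temperature=0.0,
    cost_per_input_token=0.20 / 1e6,   # overrides the price in GROK_MODELS_DATA
    cost_per_output_token=2.00 / 1e6,
)

# generate() returns an (output, cost) tuple; schema parsing is attempted only
# when the model's metadata advertises structured-output support.
if model.supports_structured_outputs() is True:
    output, cost = model.generate("Is the sky blue? Answer yes or no.", schema=Verdict)
else:
    output, cost = model.generate("Is the sky blue? Answer yes or no.")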
deepeval/models/llms/kimi_model.py +140 -90

@@ -1,111 +1,101 @@
-from typing import Optional, Tuple, Union, Dict
+from typing import Optional, Tuple, Union, Dict, List
 from openai import OpenAI, AsyncOpenAI
 from pydantic import BaseModel, SecretStr
 
+from deepeval.errors import DeepEvalError
 from deepeval.config.settings import get_settings
 from deepeval.models.retry_policy import (
     create_retry_decorator,
     sdk_retries_for,
 )
 from deepeval.models.llms.utils import trim_and_load_json
-from deepeval.models.utils import
+from deepeval.models.utils import (
+    require_costs,
+    require_secret_api_key,
+)
+from deepeval.test_case import MLLMImage
+from deepeval.utils import check_if_multimodal, convert_to_multi_modal_array
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.constants import ProviderSlug as PS
-
+from deepeval.models.llms.constants import KIMI_MODELS_DATA
+from deepeval.utils import require_param
 
 retry_kimi = create_retry_decorator(PS.KIMI)
 
-json_mode_models = [
-    "kimi-thinking-preview",
-    "kimi-k2-0711-preview",
-    "kimi-latest-128k",
-    "kimi-latest-32k",
-    "kimi-latest-8k",
-]
-
-model_pricing = {
-    "kimi-latest-8k": {
-        "input": 0.20 / 1e6,
-        "output": 2.00 / 1e6,
-    },
-    "kimi-latest-32k": {
-        "input": 1.00 / 1e6,
-        "output": 3.00 / 1e6,
-    },
-    "kimi-latest-128k": {
-        "input": 2.00 / 1e6,
-        "output": 5.00 / 1e6,
-    },
-    "kimi-k2-0711-preview": {
-        "input": 0.60 / 1e6,
-        "output": 2.50 / 1e6,
-    },
-    "kimi-thinking-preview": {
-        "input": 30 / 1e6,
-        "output": 30 / 1e6,
-    },
-    "moonshot-v1-8k": {
-        "input": 1.00 / 1e6,
-        "output": 2.00 / 1e6,
-    },
-    "moonshot-v1-32k": {
-        "input": 2.00 / 1e6,
-        "output": 3.00 / 1e6,
-    },
-    "moonshot-v1-128k": {
-        "input": 0.20 / 1e6,
-        "output": 5.00 / 1e6,
-    },
-    "moonshot-v1-8k-vision-preview": {
-        "input": 1.00 / 1e6,
-        "output": 2.00 / 1e6,
-    },
-    "moonshot-v1-32k-vision-preview": {
-        "input": 2.00 / 1e6,
-        "output": 3.00 / 1e6,
-    },
-    "moonshot-v1-128k-vision-preview": {
-        "input": 0.20 / 1e6,
-        "output": 5.00 / 1e6,
-    },
-}
-
 
 class KimiModel(DeepEvalBaseLLM):
     def __init__(
         self,
-        api_key: Optional[str] = None,
         model: Optional[str] = None,
-
+        api_key: Optional[str] = None,
+        temperature: Optional[float] = None,
+        cost_per_input_token: Optional[float] = None,
+        cost_per_output_token: Optional[float] = None,
         generation_kwargs: Optional[Dict] = None,
         **kwargs,
     ):
         settings = get_settings()
 
-
-        if model_name not in model_pricing:
-            raise ValueError(
-                f"Invalid model. Available Moonshot models: {', '.join(model_pricing.keys())}"
-            )
+        model = model or settings.MOONSHOT_MODEL_NAME
 
-
-
-
+        if temperature is not None:
+            temperature = float(temperature)
+        elif settings.TEMPERATURE is not None:
+            temperature = settings.TEMPERATURE
         else:
-
-
-
+            temperature = 0.0
+
+        cost_per_input_token = (
+            cost_per_input_token
+            if cost_per_input_token is not None
+            else settings.MOONSHOT_COST_PER_INPUT_TOKEN
+        )
+        cost_per_output_token = (
+            cost_per_output_token
+            if cost_per_output_token is not None
+            else settings.MOONSHOT_COST_PER_OUTPUT_TOKEN
+        )
 
         if api_key is not None:
             # keep it secret, keep it safe from serializings, logging and alike
-            self.api_key: SecretStr
+            self.api_key: Optional[SecretStr] = SecretStr(api_key)
         else:
             self.api_key = settings.MOONSHOT_API_KEY
 
+        # validation
+        model = require_param(
+            model,
+            provider_label="KimiModel",
+            env_var_name="MOONSHOT_MODEL_NAME",
+            param_hint="model",
+        )
+
+        if temperature < 0:
+            raise DeepEvalError("Temperature must be >= 0.")
+
+        self.model_data = KIMI_MODELS_DATA.get(model)
+        self.temperature = temperature
+
+        cost_per_input_token, cost_per_output_token = require_costs(
+            self.model_data,
+            model,
+            "MOONSHOT_COST_PER_INPUT_TOKEN",
+            "MOONSHOT_COST_PER_OUTPUT_TOKEN",
+            cost_per_input_token,
+            cost_per_output_token,
+        )
+        self.model_data.input_price = float(cost_per_input_token)
+        self.model_data.output_price = float(cost_per_output_token)
+
         self.base_url = "https://api.moonshot.cn/v1"
+        # Keep sanitized kwargs for client call to strip legacy keys
         self.kwargs = kwargs
-        self.
-
+        self.kwargs.pop("temperature", None)
+
+        self.generation_kwargs = dict(generation_kwargs or {})
+        self.generation_kwargs.pop("temperature", None)
+
+        super().__init__(model)
 
     ###############################################
     # Other generate functions

@@ -114,12 +104,19 @@ class KimiModel(DeepEvalBaseLLM):
     @retry_kimi
     def generate(
         self, prompt: str, schema: Optional[BaseModel] = None
-    ) -> Tuple[Union[str,
+    ) -> Tuple[Union[str, BaseModel], float]:
+
+        if check_if_multimodal(prompt):
+            prompt = convert_to_multi_modal_array(input=prompt)
+            content = self.generate_content(prompt)
+        else:
+            content = [{"type": "text", "text": prompt}]
+
         client = self.load_model(async_mode=False)
-        if schema and self.
+        if schema and self.supports_json_mode() is True:
             completion = client.chat.completions.create(
-                model=self.
-                messages=[{"role": "user", "content":
+                model=self.name,
+                messages=[{"role": "user", "content": content}],
                 response_format={"type": "json_object"},
                 temperature=self.temperature,
                 **self.generation_kwargs,

@@ -134,8 +131,8 @@ class KimiModel(DeepEvalBaseLLM):
             return schema.model_validate(json_output), cost
 
         completion = client.chat.completions.create(
-            model=self.
-            messages=[{"role": "user", "content":
+            model=self.name,
+            messages=[{"role": "user", "content": content}],
             **self.generation_kwargs,
         )
         output = completion.choices[0].message.content

@@ -152,12 +149,19 @@ class KimiModel(DeepEvalBaseLLM):
     @retry_kimi
     async def a_generate(
         self, prompt: str, schema: Optional[BaseModel] = None
-    ) -> Tuple[Union[str,
+    ) -> Tuple[Union[str, BaseModel], float]:
+
+        if check_if_multimodal(prompt):
+            prompt = convert_to_multi_modal_array(input=prompt)
+            content = self.generate_content(prompt)
+        else:
+            content = [{"type": "text", "text": prompt}]
+
         client = self.load_model(async_mode=True)
-        if schema and self.
+        if schema and self.supports_json_mode() is True:
             completion = await client.chat.completions.create(
-                model=self.
-                messages=[{"role": "user", "content":
+                model=self.name,
+                messages=[{"role": "user", "content": content}],
                 response_format={"type": "json_object"},
                 temperature=self.temperature,
                 **self.generation_kwargs,

@@ -172,8 +176,8 @@ class KimiModel(DeepEvalBaseLLM):
             return schema.model_validate(json_output), cost
 
         completion = await client.chat.completions.create(
-            model=self.
-            messages=[{"role": "user", "content":
+            model=self.name,
+            messages=[{"role": "user", "content": content}],
             **self.generation_kwargs,
         )
         output = completion.choices[0].message.content

@@ -187,6 +191,34 @@ class KimiModel(DeepEvalBaseLLM):
         else:
             return output, cost
 
+    def generate_content(
+        self, multimodal_input: List[Union[str, MLLMImage]] = []
+    ):
+        content = []
+        for element in multimodal_input:
+            if isinstance(element, str):
+                content.append({"type": "text", "text": element})
+            elif isinstance(element, MLLMImage):
+                if element.url and not element.local:
+                    content.append(
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": element.url},
+                        }
+                    )
+                else:
+                    element.ensure_images_loaded()
+                    data_uri = (
+                        f"data:{element.mimeType};base64,{element.dataBase64}"
+                    )
+                    content.append(
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": data_uri},
+                        }
+                    )
+        return content
+
     ###############################################
     # Utilities
     ###############################################

@@ -196,11 +228,29 @@ class KimiModel(DeepEvalBaseLLM):
         input_tokens: int,
         output_tokens: int,
     ) -> float:
-
-
-        output_cost = output_tokens * pricing["output"]
+        input_cost = input_tokens * self.model_data.input_price
+        output_cost = output_tokens * self.model_data.output_price
         return input_cost + output_cost
 
+    ###############################################
+    # Capabilities
+    ###############################################
+
+    def supports_log_probs(self) -> Union[bool, None]:
+        return self.model_data.supports_log_probs
+
+    def supports_temperature(self) -> Union[bool, None]:
+        return self.model_data.supports_temperature
+
+    def supports_multimodal(self) -> Union[bool, None]:
+        return self.model_data.supports_multimodal
+
+    def supports_structured_outputs(self) -> Union[bool, None]:
+        return self.model_data.supports_structured_outputs
+
+    def supports_json_mode(self) -> Union[bool, None]:
+        return self.model_data.supports_json
+
     ###############################################
     # Model
     ###############################################

@@ -244,4 +294,4 @@ class KimiModel(DeepEvalBaseLLM):
             raise
 
     def get_model_name(self):
-        return f"{self.
+        return f"{self.name} (KIMI)"