deepeval 3.7.4__py3-none-any.whl → 3.7.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/config/settings.py +35 -1
- deepeval/dataset/api.py +23 -1
- deepeval/dataset/golden.py +139 -2
- deepeval/evaluate/evaluate.py +16 -11
- deepeval/evaluate/execute.py +13 -181
- deepeval/evaluate/utils.py +6 -26
- deepeval/integrations/pydantic_ai/agent.py +19 -2
- deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
- deepeval/key_handler.py +3 -0
- deepeval/metrics/__init__.py +14 -16
- deepeval/metrics/answer_relevancy/answer_relevancy.py +118 -116
- deepeval/metrics/answer_relevancy/template.py +22 -3
- deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
- deepeval/metrics/arena_g_eval/template.py +17 -1
- deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
- deepeval/metrics/argument_correctness/template.py +19 -2
- deepeval/metrics/base_metric.py +13 -44
- deepeval/metrics/bias/bias.py +102 -108
- deepeval/metrics/bias/template.py +14 -2
- deepeval/metrics/contextual_precision/contextual_precision.py +96 -94
- deepeval/metrics/contextual_precision/template.py +115 -66
- deepeval/metrics/contextual_recall/contextual_recall.py +94 -84
- deepeval/metrics/contextual_recall/template.py +106 -55
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +86 -84
- deepeval/metrics/contextual_relevancy/template.py +87 -58
- deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
- deepeval/metrics/conversation_completeness/template.py +23 -3
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
- deepeval/metrics/conversational_dag/nodes.py +66 -123
- deepeval/metrics/conversational_dag/templates.py +16 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
- deepeval/metrics/dag/dag.py +10 -0
- deepeval/metrics/dag/nodes.py +63 -126
- deepeval/metrics/dag/templates.py +16 -2
- deepeval/metrics/exact_match/exact_match.py +9 -1
- deepeval/metrics/faithfulness/faithfulness.py +138 -149
- deepeval/metrics/faithfulness/schema.py +1 -1
- deepeval/metrics/faithfulness/template.py +200 -115
- deepeval/metrics/g_eval/g_eval.py +87 -78
- deepeval/metrics/g_eval/template.py +18 -1
- deepeval/metrics/g_eval/utils.py +7 -6
- deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
- deepeval/metrics/goal_accuracy/template.py +21 -3
- deepeval/metrics/hallucination/hallucination.py +60 -75
- deepeval/metrics/hallucination/template.py +13 -0
- deepeval/metrics/indicator.py +7 -10
- deepeval/metrics/json_correctness/json_correctness.py +40 -38
- deepeval/metrics/json_correctness/template.py +10 -0
- deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
- deepeval/metrics/knowledge_retention/schema.py +9 -3
- deepeval/metrics/knowledge_retention/template.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +68 -38
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
- deepeval/metrics/mcp/template.py +52 -0
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
- deepeval/metrics/mcp_use_metric/template.py +12 -0
- deepeval/metrics/misuse/misuse.py +77 -97
- deepeval/metrics/misuse/template.py +15 -0
- deepeval/metrics/multimodal_metrics/__init__.py +0 -19
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +59 -53
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +79 -95
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +59 -53
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +59 -53
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +111 -109
- deepeval/metrics/non_advice/non_advice.py +79 -105
- deepeval/metrics/non_advice/template.py +12 -0
- deepeval/metrics/pattern_match/pattern_match.py +12 -4
- deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
- deepeval/metrics/pii_leakage/template.py +14 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
- deepeval/metrics/plan_adherence/template.py +11 -0
- deepeval/metrics/plan_quality/plan_quality.py +63 -87
- deepeval/metrics/plan_quality/template.py +9 -0
- deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
- deepeval/metrics/prompt_alignment/template.py +12 -0
- deepeval/metrics/ragas.py +3 -3
- deepeval/metrics/role_adherence/role_adherence.py +48 -71
- deepeval/metrics/role_adherence/template.py +14 -0
- deepeval/metrics/role_violation/role_violation.py +75 -108
- deepeval/metrics/role_violation/template.py +12 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
- deepeval/metrics/step_efficiency/template.py +11 -0
- deepeval/metrics/summarization/summarization.py +115 -183
- deepeval/metrics/summarization/template.py +19 -0
- deepeval/metrics/task_completion/task_completion.py +67 -73
- deepeval/metrics/tool_correctness/tool_correctness.py +45 -44
- deepeval/metrics/tool_use/tool_use.py +42 -66
- deepeval/metrics/topic_adherence/template.py +13 -0
- deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
- deepeval/metrics/toxicity/template.py +13 -0
- deepeval/metrics/toxicity/toxicity.py +80 -99
- deepeval/metrics/turn_contextual_precision/schema.py +21 -0
- deepeval/metrics/turn_contextual_precision/template.py +187 -0
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +592 -0
- deepeval/metrics/turn_contextual_recall/schema.py +21 -0
- deepeval/metrics/turn_contextual_recall/template.py +178 -0
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +563 -0
- deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
- deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +576 -0
- deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
- deepeval/metrics/turn_faithfulness/template.py +218 -0
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +627 -0
- deepeval/metrics/turn_relevancy/template.py +14 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
- deepeval/metrics/utils.py +158 -122
- deepeval/models/__init__.py +0 -12
- deepeval/models/base_model.py +49 -33
- deepeval/models/embedding_models/__init__.py +7 -0
- deepeval/models/embedding_models/azure_embedding_model.py +79 -33
- deepeval/models/embedding_models/local_embedding_model.py +39 -20
- deepeval/models/embedding_models/ollama_embedding_model.py +52 -19
- deepeval/models/embedding_models/openai_embedding_model.py +42 -22
- deepeval/models/llms/amazon_bedrock_model.py +226 -72
- deepeval/models/llms/anthropic_model.py +178 -63
- deepeval/models/llms/azure_model.py +218 -60
- deepeval/models/llms/constants.py +2032 -0
- deepeval/models/llms/deepseek_model.py +95 -40
- deepeval/models/llms/gemini_model.py +209 -64
- deepeval/models/llms/grok_model.py +139 -68
- deepeval/models/llms/kimi_model.py +140 -90
- deepeval/models/llms/litellm_model.py +131 -37
- deepeval/models/llms/local_model.py +125 -21
- deepeval/models/llms/ollama_model.py +147 -24
- deepeval/models/llms/openai_model.py +222 -269
- deepeval/models/llms/portkey_model.py +81 -22
- deepeval/models/llms/utils.py +8 -3
- deepeval/models/retry_policy.py +17 -14
- deepeval/models/utils.py +106 -5
- deepeval/optimizer/__init__.py +5 -0
- deepeval/optimizer/algorithms/__init__.py +6 -0
- deepeval/optimizer/algorithms/base.py +29 -0
- deepeval/optimizer/algorithms/configs.py +18 -0
- deepeval/optimizer/algorithms/copro/__init__.py +5 -0
- deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
- deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
- deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
- deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
- deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
- deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
- deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
- deepeval/optimizer/algorithms/simba/__init__.py +5 -0
- deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
- deepeval/{optimization → optimizer}/configs.py +5 -8
- deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
- deepeval/optimizer/prompt_optimizer.py +263 -0
- deepeval/optimizer/rewriter/__init__.py +5 -0
- deepeval/optimizer/rewriter/rewriter.py +124 -0
- deepeval/optimizer/rewriter/utils.py +214 -0
- deepeval/optimizer/scorer/__init__.py +5 -0
- deepeval/optimizer/scorer/base.py +86 -0
- deepeval/optimizer/scorer/scorer.py +316 -0
- deepeval/optimizer/scorer/utils.py +30 -0
- deepeval/optimizer/types.py +148 -0
- deepeval/{optimization → optimizer}/utils.py +47 -165
- deepeval/prompt/prompt.py +5 -9
- deepeval/simulator/conversation_simulator.py +43 -0
- deepeval/simulator/template.py +13 -0
- deepeval/test_case/__init__.py +1 -3
- deepeval/test_case/api.py +26 -45
- deepeval/test_case/arena_test_case.py +7 -2
- deepeval/test_case/conversational_test_case.py +68 -1
- deepeval/test_case/llm_test_case.py +206 -1
- deepeval/test_case/utils.py +4 -8
- deepeval/test_run/api.py +18 -14
- deepeval/test_run/test_run.py +3 -3
- deepeval/tracing/patchers.py +9 -4
- deepeval/tracing/tracing.py +2 -2
- deepeval/utils.py +65 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -4
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/RECORD +180 -193
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -148
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
- deepeval/models/mlllms/__init__.py +0 -4
- deepeval/models/mlllms/azure_model.py +0 -343
- deepeval/models/mlllms/gemini_model.py +0 -313
- deepeval/models/mlllms/ollama_model.py +0 -175
- deepeval/models/mlllms/openai_model.py +0 -309
- deepeval/optimization/__init__.py +0 -13
- deepeval/optimization/adapters/__init__.py +0 -2
- deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
- deepeval/optimization/aggregates.py +0 -14
- deepeval/optimization/copro/configs.py +0 -31
- deepeval/optimization/gepa/__init__.py +0 -7
- deepeval/optimization/gepa/configs.py +0 -115
- deepeval/optimization/miprov2/configs.py +0 -134
- deepeval/optimization/miprov2/loop.py +0 -785
- deepeval/optimization/mutations/__init__.py +0 -0
- deepeval/optimization/mutations/prompt_rewriter.py +0 -458
- deepeval/optimization/policies/__init__.py +0 -16
- deepeval/optimization/policies/tie_breaker.py +0 -67
- deepeval/optimization/prompt_optimizer.py +0 -462
- deepeval/optimization/simba/__init__.py +0 -0
- deepeval/optimization/simba/configs.py +0 -33
- deepeval/optimization/types.py +0 -361
- deepeval/test_case/mllm_test_case.py +0 -170
- /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
- /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
deepeval/models/llms/litellm_model.py

@@ -9,10 +9,17 @@ from tenacity import (
     RetryCallState,
 )
 
+from deepeval.errors import DeepEvalError
 from deepeval.config.settings import get_settings
-from deepeval.models.utils import
+from deepeval.models.utils import (
+    require_secret_api_key,
+    normalize_kwargs_and_extract_aliases,
+)
+from deepeval.test_case import MLLMImage
+from deepeval.utils import check_if_multimodal, convert_to_multi_modal_array
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.models.llms.utils import trim_and_load_json
+from deepeval.utils import require_param
 
 
 def log_retry_error(retry_state: RetryCallState):

@@ -27,6 +34,10 @@ retryable_exceptions = (
     Exception,  # LiteLLM handles specific exceptions internally
 )
 
+_ALIAS_MAP = {
+    "base_url": ["api_base"],
+}
+
 
 class LiteLLMModel(DeepEvalBaseLLM):
     EXP_BASE: int = 2

@@ -39,24 +50,29 @@ class LiteLLMModel(DeepEvalBaseLLM):
         self,
         model: Optional[str] = None,
         api_key: Optional[str] = None,
-
-        temperature: float =
+        base_url: Optional[str] = None,
+        temperature: Optional[float] = None,
         generation_kwargs: Optional[Dict] = None,
         **kwargs,
     ):
-
         settings = get_settings()
+        normalized_kwargs, alias_values = normalize_kwargs_and_extract_aliases(
+            "LiteLLMModel",
+            kwargs,
+            _ALIAS_MAP,
+        )
+
+        # re-map depricated keywords to re-named positional args
+        if base_url is None and "base_url" in alias_values:
+            base_url = alias_values["base_url"]
+
         # Get model name from parameter or key file
-
-        if not model_name:
-            raise ValueError(
-                "Model name must be provided either through parameter or set-litellm command"
-            )
+        model = model or settings.LITELLM_MODEL_NAME
 
         # Get API key from parameter, or settings
         if api_key is not None:
             # keep it secret, keep it safe from serializings, logging and aolike
-            self.api_key: SecretStr
+            self.api_key: Optional[SecretStr] = SecretStr(api_key)
         else:
             self.api_key = (
                 settings.LITELLM_API_KEY
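The constructor now funnels `**kwargs` through `normalize_kwargs_and_extract_aliases` with `_ALIAS_MAP`, so the deprecated `api_base` keyword keeps working but is remapped onto the new `base_url` parameter. The helper lives in `deepeval/models/utils.py` and its body is not part of this diff, so the following is only a hedged sketch of the behavior the call site implies:

```python
from typing import Dict, List, Tuple

# Sketch only: the real helper is in deepeval/models/utils.py and is not
# shown in this diff. This mirrors what the call site appears to expect.
def normalize_kwargs_and_extract_aliases(
    provider_label: str,
    kwargs: Dict,
    alias_map: Dict[str, List[str]],
) -> Tuple[Dict, Dict]:
    # provider_label is presumably used for deprecation warnings (not shown)
    normalized = dict(kwargs)
    alias_values = {}
    for canonical, aliases in alias_map.items():
        for alias in aliases:
            if alias in normalized:
                # e.g. LiteLLMModel(api_base=...) surfaces as alias_values["base_url"]
                alias_values[canonical] = normalized.pop(alias)
    return normalized, alias_values
```

In other words, `LiteLLMModel(api_base="http://...")` still works: the value is popped out of `kwargs`, surfaced in `alias_values`, and picked up by the `if base_url is None` branch above.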
@@ -67,8 +83,8 @@ class LiteLLMModel(DeepEvalBaseLLM):
         )
 
         # Get API base from parameter, key file, or environment variable
-
-
+        base_url = (
+            base_url
             or (
                 str(settings.LITELLM_API_BASE)
                 if settings.LITELLM_API_BASE is not None

@@ -80,14 +96,37 @@ class LiteLLMModel(DeepEvalBaseLLM):
                 else None
             )
         )
+        self.base_url = (
+            str(base_url).rstrip("/") if base_url is not None else None
+        )
+
+        if temperature is not None:
+            temperature = float(temperature)
+        elif settings.TEMPERATURE is not None:
+            temperature = settings.TEMPERATURE
+        else:
+            temperature = 0.0
+
+        # validation
+        model = require_param(
+            model,
+            provider_label="LiteLLMModel",
+            env_var_name="LITELLM_MODEL_NAME",
+            param_hint="model",
+        )
 
         if temperature < 0:
-            raise
+            raise DeepEvalError("Temperature must be >= 0.")
         self.temperature = temperature
-
-        self.
+        # Keep sanitized kwargs for client call to strip legacy keys
+        self.kwargs = normalized_kwargs
+        self.kwargs.pop("temperature", None)
+
+        self.generation_kwargs = dict(generation_kwargs or {})
+        self.generation_kwargs.pop("temperature", None)
+
         self.evaluation_cost = 0.0  # Initialize cost to 0.0
-        super().__init__(
+        super().__init__(model)
 
     @retry(
         wait=wait_exponential_jitter(
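Temperature now resolves with a fixed precedence: an explicit constructor argument wins, then the `TEMPERATURE` setting, then a default of `0.0`. A minimal restatement of the resolution order from the hunk above:

```python
# Restates the resolution order from the diff; `settings_temperature`
# stands in for get_settings().TEMPERATURE.
def resolve_temperature(explicit, settings_temperature) -> float:
    if explicit is not None:
        return float(explicit)       # constructor argument wins
    if settings_temperature is not None:
        return settings_temperature  # then the TEMPERATURE setting
    return 0.0                       # default

assert resolve_temperature(0.3, 0.7) == 0.3
assert resolve_temperature(None, 0.7) == 0.7
assert resolve_temperature(None, None) == 0.0
```

The same pattern, along with the same `require_param` / `DeepEvalError` validation, appears in `LocalModel` further down.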
@@ -99,12 +138,19 @@ class LiteLLMModel(DeepEvalBaseLLM):
     )
     def generate(
         self, prompt: str, schema: Optional[BaseModel] = None
-    ) -> Union[str,
+    ) -> Tuple[Union[str, BaseModel], float]:
+
         from litellm import completion
 
+        if check_if_multimodal(prompt):
+            prompt = convert_to_multi_modal_array(input=prompt)
+            content = self.generate_content(prompt)
+        else:
+            content = [{"type": "text", "text": prompt}]
+
         completion_params = {
-            "model": self.
-            "messages": [{"role": "user", "content":
+            "model": self.name,
+            "messages": [{"role": "user", "content": content}],
             "temperature": self.temperature,
         }
 
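`generate` now always sends content as a list of typed parts: a plain prompt becomes a single text part, while a multimodal prompt is expanded by `generate_content`. Illustrative values only, assuming one text part and one remote image:

```python
# Shape of completion_params["messages"] after the multimodal branch
# (example values; not taken from the diff).
messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "What is in this image?"},
            {
                "type": "image_url",
                "image_url": {"url": "https://example.com/cat.png"},
            },
        ],
    }
]
```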
@@ -112,12 +158,12 @@ class LiteLLMModel(DeepEvalBaseLLM):
         api_key = require_secret_api_key(
             self.api_key,
             provider_label="LiteLLM",
-            env_var_name="LITELLM_API_KEY|OPENAI_API_KEY|ANTHROPIC_API_KEY|GOOGLE_API_KEY",
+            env_var_name="LITELLM_API_KEY|LITELLM_PROXY_API_KEY|OPENAI_API_KEY|ANTHROPIC_API_KEY|GOOGLE_API_KEY",
             param_hint="`api_key` to LiteLLMModel(...)",
         )
         completion_params["api_key"] = api_key
-        if self.
-            completion_params["api_base"] = self.
+        if self.base_url:
+            completion_params["api_base"] = self.base_url
 
         # Add schema if provided
         if schema:

@@ -154,12 +200,19 @@ class LiteLLMModel(DeepEvalBaseLLM):
     )
     async def a_generate(
         self, prompt: str, schema: Optional[BaseModel] = None
-    ) -> Union[str,
+    ) -> Tuple[Union[str, BaseModel], float]:
+
         from litellm import acompletion
 
+        if check_if_multimodal(prompt):
+            prompt = convert_to_multi_modal_array(input=prompt)
+            content = self.generate_content(prompt)
+        else:
+            content = [{"type": "text", "text": prompt}]
+
         completion_params = {
-            "model": self.
-            "messages": [{"role": "user", "content":
+            "model": self.name,
+            "messages": [{"role": "user", "content": content}],
             "temperature": self.temperature,
         }
 

@@ -171,8 +224,8 @@ class LiteLLMModel(DeepEvalBaseLLM):
             param_hint="`api_key` to LiteLLMModel(...)",
         )
         completion_params["api_key"] = api_key
-        if self.
-            completion_params["api_base"] = self.
+        if self.base_url:
+            completion_params["api_base"] = self.base_url
 
         # Add schema if provided
         if schema:

@@ -221,12 +274,17 @@ class LiteLLMModel(DeepEvalBaseLLM):
             env_var_name="LITELLM_API_KEY|OPENAI_API_KEY|ANTHROPIC_API_KEY|GOOGLE_API_KEY",
             param_hint="`api_key` to LiteLLMModel(...)",
         )
+        if check_if_multimodal(prompt):
+            prompt = convert_to_multi_modal_array(input=prompt)
+            content = self.generate_content(prompt)
+        else:
+            content = [{"type": "text", "text": prompt}]
         completion_params = {
-            "model": self.
-            "messages": [{"role": "user", "content":
+            "model": self.name,
+            "messages": [{"role": "user", "content": content}],
             "temperature": self.temperature,
             "api_key": api_key,
-            "api_base": self.
+            "api_base": self.base_url,
             "logprobs": True,
             "top_logprobs": top_logprobs,
         }

@@ -262,12 +320,17 @@ class LiteLLMModel(DeepEvalBaseLLM):
             env_var_name="LITELLM_API_KEY|OPENAI_API_KEY|ANTHROPIC_API_KEY|GOOGLE_API_KEY",
             param_hint="`api_key` to LiteLLMModel(...)",
         )
+        if check_if_multimodal(prompt):
+            prompt = convert_to_multi_modal_array(input=prompt)
+            content = self.generate_content(prompt)
+        else:
+            content = [{"type": "text", "text": prompt}]
         completion_params = {
-            "model": self.
-            "messages": [{"role": "user", "content":
+            "model": self.name,
+            "messages": [{"role": "user", "content": content}],
             "temperature": self.temperature,
             "api_key": api_key,
-            "api_base": self.
+            "api_base": self.base_url,
             "logprobs": True,
             "top_logprobs": top_logprobs,
         }

@@ -302,12 +365,12 @@ class LiteLLMModel(DeepEvalBaseLLM):
             param_hint="`api_key` to LiteLLMModel(...)",
         )
         completion_params = {
-            "model": self.
+            "model": self.name,
             "messages": [{"role": "user", "content": prompt}],
             "temperature": temperature,
             "n": n,
             "api_key": api_key,
-            "api_base": self.
+            "api_base": self.base_url,
         }
         completion_params.update(self.kwargs)
 

@@ -320,6 +383,34 @@ class LiteLLMModel(DeepEvalBaseLLM):
             logging.error(f"Error in LiteLLM generate_samples: {e}")
             raise
 
+    def generate_content(
+        self, multimodal_input: List[Union[str, MLLMImage]] = []
+    ):
+        content = []
+        for element in multimodal_input:
+            if isinstance(element, str):
+                content.append({"type": "text", "text": element})
+            elif isinstance(element, MLLMImage):
+                if element.url and not element.local:
+                    content.append(
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": element.url},
+                        }
+                    )
+                else:
+                    element.ensure_images_loaded()
+                    data_uri = (
+                        f"data:{element.mimeType};base64,{element.dataBase64}"
+                    )
+                    content.append(
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": data_uri},
+                        }
+                    )
+        return content
+
     def calculate_cost(self, response: Any) -> float:
         """Calculate the cost of the response based on token usage."""
         try:
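The new `generate_content` helper emits OpenAI-style content parts: remote images (`element.url` set, `local` false) pass through as URL references, while local images are loaded and inlined as base64 data URIs. A hedged usage sketch (the `MLLMImage` constructor arguments are assumed, not shown in this diff):

```python
from deepeval.test_case import MLLMImage

model = LiteLLMModel(model="gpt-4o")  # assumed construction
parts = model.generate_content(
    [
        "Describe this:",
        MLLMImage(url="https://example.com/cat.png"),  # remote image (assumed ctor)
    ]
)
# parts[0] -> {"type": "text", "text": "Describe this:"}
# parts[1] -> {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}}
```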
@@ -353,8 +444,8 @@ class LiteLLMModel(DeepEvalBaseLLM):
     def get_model_name(self) -> str:
         from litellm import get_llm_provider
 
-        provider = get_llm_provider(self.
-        return f"{self.
+        provider = get_llm_provider(self.name)
+        return f"{self.name} ({provider})"
 
     def load_model(self, async_mode: bool = False):
         """

@@ -369,3 +460,6 @@ class LiteLLMModel(DeepEvalBaseLLM):
         None as LiteLLM handles client creation internally
         """
         return None
+
+    def supports_multimodal(self):
+        return True
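With `supports_multimodal` returning `True`, callers can gate image handling on capability rather than on the concrete model class (the old `mlllms` wrappers are removed in this release, per the file list above). A minimal sketch:

```python
# Capability check before building an image-bearing prompt (sketch; the
# surrounding evaluation flow is assumed).
model = LiteLLMModel(model="gpt-4o")  # api key via LITELLM_API_KEY etc.
prompt = "Describe the attached image."
if model.supports_multimodal():
    reply, cost = model.generate(prompt)  # generate now returns (output, cost)
```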
deepeval/models/llms/local_model.py

@@ -1,17 +1,26 @@
-from typing import Optional, Tuple, Union, Dict
+from typing import Optional, Tuple, Union, Dict, List
 from pydantic import BaseModel, SecretStr
 from openai import OpenAI, AsyncOpenAI
 from openai.types.chat import ChatCompletion
 
+from deepeval.errors import DeepEvalError
 from deepeval.config.settings import get_settings
 from deepeval.models.retry_policy import (
     create_retry_decorator,
     sdk_retries_for,
 )
 from deepeval.models.llms.utils import trim_and_load_json
-from deepeval.models.utils import
+from deepeval.models.utils import (
+    require_secret_api_key,
+)
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.constants import ProviderSlug as PS
+from deepeval.test_case import MLLMImage
+from deepeval.utils import (
+    check_if_multimodal,
+    convert_to_multi_modal_array,
+    require_param,
+)
 
 
 # consistent retry rules

@@ -22,47 +31,75 @@ class LocalModel(DeepEvalBaseLLM):
     def __init__(
         self,
         model: Optional[str] = None,
-        base_url: Optional[str] = None,
         api_key: Optional[str] = None,
-
+        base_url: Optional[str] = None,
+        temperature: Optional[float] = None,
         format: Optional[str] = None,
         generation_kwargs: Optional[Dict] = None,
         **kwargs,
     ):
         settings = get_settings()
 
-
+        model = model or settings.LOCAL_MODEL_NAME
         if api_key is not None:
-
-            self.local_model_api_key: SecretStr | None = SecretStr(api_key)
+            self.local_model_api_key: Optional[SecretStr] = SecretStr(api_key)
         else:
             self.local_model_api_key = settings.LOCAL_MODEL_API_KEY
 
+        base_url = (
+            base_url if base_url is not None else settings.LOCAL_MODEL_BASE_URL
+        )
         self.base_url = (
-            base_url
-            or settings.LOCAL_MODEL_BASE_URL
-            and str(settings.LOCAL_MODEL_BASE_URL)
+            str(base_url).rstrip("/") if base_url is not None else None
         )
         self.format = format or settings.LOCAL_MODEL_FORMAT
+
+        if temperature is not None:
+            temperature = float(temperature)
+        elif settings.TEMPERATURE is not None:
+            temperature = settings.TEMPERATURE
+        else:
+            temperature = 0.0
+
+        # validation
+        model = require_param(
+            model,
+            provider_label="LocalModel",
+            env_var_name="LOCAL_MODEL_NAME",
+            param_hint="model",
+        )
+
         if temperature < 0:
-            raise
+            raise DeepEvalError("Temperature must be >= 0.")
         self.temperature = temperature
+
         self.kwargs = kwargs
-        self.
-
+        self.kwargs.pop("temperature", None)
+
+        self.generation_kwargs = dict(generation_kwargs or {})
+        self.generation_kwargs.pop("temperature", None)
+
+        super().__init__(model)
 
     ###############################################
-    #
+    # Generate functions
     ###############################################
 
     @retry_local
     def generate(
         self, prompt: str, schema: Optional[BaseModel] = None
-    ) -> Tuple[Union[str,
+    ) -> Tuple[Union[str, BaseModel], float]:
+
+        if check_if_multimodal(prompt):
+            prompt = convert_to_multi_modal_array(input=prompt)
+            content = self.generate_content(prompt)
+        else:
+            content = prompt
+
         client = self.load_model(async_mode=False)
         response: ChatCompletion = client.chat.completions.create(
-            model=self.
-            messages=[{"role": "user", "content":
+            model=self.name,
+            messages=[{"role": "user", "content": content}],
             temperature=self.temperature,
             **self.generation_kwargs,
         )
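`LocalModel.base_url` is now normalized with `rstrip("/")`, so a trailing slash in `LOCAL_MODEL_BASE_URL` no longer produces doubled slashes when the OpenAI client joins paths:

```python
# Mirrors the normalization expression from the diff (example URL).
base_url = "http://localhost:11434/v1/"
normalized = str(base_url).rstrip("/") if base_url is not None else None
assert normalized == "http://localhost:11434/v1"
```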
@@ -77,11 +114,18 @@ class LocalModel(DeepEvalBaseLLM):
     @retry_local
     async def a_generate(
         self, prompt: str, schema: Optional[BaseModel] = None
-    ) -> Tuple[Union[str,
+    ) -> Tuple[Union[str, BaseModel], float]:
+
+        if check_if_multimodal(prompt):
+            prompt = convert_to_multi_modal_array(input=prompt)
+            content = self.generate_content(prompt)
+        else:
+            content = prompt
+
         client = self.load_model(async_mode=True)
         response: ChatCompletion = await client.chat.completions.create(
-            model=self.
-            messages=[{"role": "user", "content":
+            model=self.name,
+            messages=[{"role": "user", "content": content}],
             temperature=self.temperature,
             **self.generation_kwargs,
         )

@@ -93,12 +137,72 @@ class LocalModel(DeepEvalBaseLLM):
         else:
             return res_content, 0.0
 
+    def generate_content(
+        self, multimodal_input: List[Union[str, MLLMImage]] = []
+    ):
+        """
+        Converts multimodal input into OpenAI-compatible format.
+        Uses data URIs for all images since we can't guarantee local servers support URL fetching.
+        """
+        prompt = []
+        for element in multimodal_input:
+            if isinstance(element, str):
+                prompt.append({"type": "text", "text": element})
+            elif isinstance(element, MLLMImage):
+                # For local servers, use data URIs for both remote and local images
+                # Most local servers don't support fetching external URLs
+                if element.url and not element.local:
+                    import requests
+                    import base64
+
+                    settings = get_settings()
+                    try:
+                        response = requests.get(
+                            element.url,
+                            timeout=(
+                                settings.MEDIA_IMAGE_CONNECT_TIMEOUT_SECONDS,
+                                settings.MEDIA_IMAGE_READ_TIMEOUT_SECONDS,
+                            ),
+                        )
+                        response.raise_for_status()
+
+                        # Get mime type from response
+                        mime_type = response.headers.get(
+                            "content-type", element.mimeType or "image/jpeg"
+                        )
+
+                        # Encode to base64
+                        b64_data = base64.b64encode(response.content).decode(
+                            "utf-8"
+                        )
+                        data_uri = f"data:{mime_type};base64,{b64_data}"
+
+                    except Exception as e:
+                        raise ValueError(
+                            f"Failed to fetch remote image {element.url}: {e}"
+                        )
+                else:
+                    element.ensure_images_loaded()
+                    mime_type = element.mimeType or "image/jpeg"
+                    data_uri = f"data:{mime_type};base64,{element.dataBase64}"
+
+                prompt.append(
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": data_uri},
+                    }
+                )
+        return prompt
+
     ###############################################
     # Model
     ###############################################
 
     def get_model_name(self):
-        return f"{self.
+        return f"{self.name} (Local Model)"
+
+    def supports_multimodal(self):
+        return True
 
     def load_model(self, async_mode: bool = False):
         if not async_mode:
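The data URIs built here follow the standard `data:<mime>;base64,<payload>` form. For reference, the same encoding applied to a local file (standalone sketch; the path and helper name are illustrative, not part of the diff):

```python
import base64
import mimetypes

def file_to_data_uri(path: str) -> str:
    # Builds the same data-URI form LocalModel.generate_content produces.
    mime_type = mimetypes.guess_type(path)[0] or "image/jpeg"
    with open(path, "rb") as f:
        b64_data = base64.b64encode(f.read()).decode("utf-8")
    return f"data:{mime_type};base64,{b64_data}"

# file_to_data_uri("cat.png") -> "data:image/png;base64,iVBORw0KG..."
```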