deepeval 3.7.5__py3-none-any.whl → 3.7.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/cli/main.py +2022 -759
- deepeval/cli/utils.py +208 -36
- deepeval/config/dotenv_handler.py +19 -0
- deepeval/config/settings.py +675 -245
- deepeval/config/utils.py +9 -1
- deepeval/dataset/api.py +23 -1
- deepeval/dataset/golden.py +106 -21
- deepeval/evaluate/evaluate.py +0 -3
- deepeval/evaluate/execute.py +162 -315
- deepeval/evaluate/utils.py +6 -30
- deepeval/key_handler.py +124 -51
- deepeval/metrics/__init__.py +0 -4
- deepeval/metrics/answer_relevancy/answer_relevancy.py +89 -132
- deepeval/metrics/answer_relevancy/template.py +102 -179
- deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
- deepeval/metrics/arena_g_eval/template.py +17 -1
- deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
- deepeval/metrics/argument_correctness/template.py +19 -2
- deepeval/metrics/base_metric.py +19 -41
- deepeval/metrics/bias/bias.py +102 -108
- deepeval/metrics/bias/template.py +14 -2
- deepeval/metrics/contextual_precision/contextual_precision.py +56 -92
- deepeval/metrics/contextual_recall/contextual_recall.py +58 -85
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +53 -83
- deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
- deepeval/metrics/conversation_completeness/template.py +23 -3
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
- deepeval/metrics/conversational_dag/nodes.py +66 -123
- deepeval/metrics/conversational_dag/templates.py +16 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
- deepeval/metrics/dag/dag.py +10 -0
- deepeval/metrics/dag/nodes.py +63 -126
- deepeval/metrics/dag/templates.py +14 -0
- deepeval/metrics/exact_match/exact_match.py +9 -1
- deepeval/metrics/faithfulness/faithfulness.py +82 -136
- deepeval/metrics/g_eval/g_eval.py +93 -79
- deepeval/metrics/g_eval/template.py +18 -1
- deepeval/metrics/g_eval/utils.py +7 -6
- deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
- deepeval/metrics/goal_accuracy/template.py +21 -3
- deepeval/metrics/hallucination/hallucination.py +60 -75
- deepeval/metrics/hallucination/template.py +13 -0
- deepeval/metrics/indicator.py +11 -10
- deepeval/metrics/json_correctness/json_correctness.py +40 -38
- deepeval/metrics/json_correctness/template.py +10 -0
- deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
- deepeval/metrics/knowledge_retention/schema.py +9 -3
- deepeval/metrics/knowledge_retention/template.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +72 -43
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +93 -75
- deepeval/metrics/mcp/schema.py +4 -0
- deepeval/metrics/mcp/template.py +59 -0
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
- deepeval/metrics/mcp_use_metric/template.py +12 -0
- deepeval/metrics/misuse/misuse.py +77 -97
- deepeval/metrics/misuse/template.py +15 -0
- deepeval/metrics/multimodal_metrics/__init__.py +0 -1
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +37 -38
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +55 -76
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +37 -38
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +37 -38
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +57 -76
- deepeval/metrics/non_advice/non_advice.py +79 -105
- deepeval/metrics/non_advice/template.py +12 -0
- deepeval/metrics/pattern_match/pattern_match.py +12 -4
- deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
- deepeval/metrics/pii_leakage/template.py +14 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
- deepeval/metrics/plan_adherence/template.py +11 -0
- deepeval/metrics/plan_quality/plan_quality.py +63 -87
- deepeval/metrics/plan_quality/template.py +9 -0
- deepeval/metrics/prompt_alignment/prompt_alignment.py +78 -86
- deepeval/metrics/prompt_alignment/template.py +12 -0
- deepeval/metrics/role_adherence/role_adherence.py +48 -71
- deepeval/metrics/role_adherence/template.py +14 -0
- deepeval/metrics/role_violation/role_violation.py +75 -108
- deepeval/metrics/role_violation/template.py +12 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
- deepeval/metrics/step_efficiency/template.py +11 -0
- deepeval/metrics/summarization/summarization.py +115 -183
- deepeval/metrics/summarization/template.py +19 -0
- deepeval/metrics/task_completion/task_completion.py +67 -73
- deepeval/metrics/tool_correctness/tool_correctness.py +43 -42
- deepeval/metrics/tool_use/schema.py +4 -0
- deepeval/metrics/tool_use/template.py +16 -2
- deepeval/metrics/tool_use/tool_use.py +72 -94
- deepeval/metrics/topic_adherence/schema.py +4 -0
- deepeval/metrics/topic_adherence/template.py +21 -1
- deepeval/metrics/topic_adherence/topic_adherence.py +68 -81
- deepeval/metrics/toxicity/template.py +13 -0
- deepeval/metrics/toxicity/toxicity.py +80 -99
- deepeval/metrics/turn_contextual_precision/schema.py +3 -3
- deepeval/metrics/turn_contextual_precision/template.py +9 -2
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +154 -154
- deepeval/metrics/turn_contextual_recall/schema.py +3 -3
- deepeval/metrics/turn_contextual_recall/template.py +8 -1
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +148 -143
- deepeval/metrics/turn_contextual_relevancy/schema.py +2 -2
- deepeval/metrics/turn_contextual_relevancy/template.py +8 -1
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +154 -157
- deepeval/metrics/turn_faithfulness/schema.py +1 -1
- deepeval/metrics/turn_faithfulness/template.py +8 -1
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +180 -203
- deepeval/metrics/turn_relevancy/template.py +14 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
- deepeval/metrics/utils.py +161 -91
- deepeval/models/__init__.py +2 -0
- deepeval/models/base_model.py +44 -6
- deepeval/models/embedding_models/azure_embedding_model.py +34 -12
- deepeval/models/embedding_models/local_embedding_model.py +22 -7
- deepeval/models/embedding_models/ollama_embedding_model.py +17 -6
- deepeval/models/embedding_models/openai_embedding_model.py +3 -2
- deepeval/models/llms/__init__.py +2 -0
- deepeval/models/llms/amazon_bedrock_model.py +229 -73
- deepeval/models/llms/anthropic_model.py +143 -48
- deepeval/models/llms/azure_model.py +169 -95
- deepeval/models/llms/constants.py +2032 -0
- deepeval/models/llms/deepseek_model.py +82 -35
- deepeval/models/llms/gemini_model.py +126 -67
- deepeval/models/llms/grok_model.py +128 -65
- deepeval/models/llms/kimi_model.py +129 -87
- deepeval/models/llms/litellm_model.py +94 -18
- deepeval/models/llms/local_model.py +115 -16
- deepeval/models/llms/ollama_model.py +97 -76
- deepeval/models/llms/openai_model.py +169 -311
- deepeval/models/llms/portkey_model.py +58 -16
- deepeval/models/llms/utils.py +5 -2
- deepeval/models/retry_policy.py +10 -5
- deepeval/models/utils.py +56 -4
- deepeval/simulator/conversation_simulator.py +49 -2
- deepeval/simulator/template.py +16 -1
- deepeval/synthesizer/synthesizer.py +19 -17
- deepeval/test_case/api.py +24 -45
- deepeval/test_case/arena_test_case.py +7 -2
- deepeval/test_case/conversational_test_case.py +55 -6
- deepeval/test_case/llm_test_case.py +60 -6
- deepeval/test_run/api.py +3 -0
- deepeval/test_run/test_run.py +6 -1
- deepeval/utils.py +26 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/METADATA +3 -3
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/RECORD +145 -148
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -133
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/WHEEL +0 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/entry_points.txt +0 -0
deepeval/models/llms/portkey_model.py
CHANGED

@@ -3,10 +3,13 @@ import requests
 from typing import Any, Dict, List, Optional, Union
 from pydantic import AnyUrl, SecretStr
 
+from deepeval.errors import DeepEvalError
 from deepeval.config.settings import get_settings
 from deepeval.models.utils import (
     require_secret_api_key,
 )
+from deepeval.test_case import MLLMImage
+from deepeval.utils import check_if_multimodal, convert_to_multi_modal_array
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.utils import require_param
 
@@ -29,16 +32,9 @@ class PortkeyModel(DeepEvalBaseLLM):
         settings = get_settings()
         model = model or settings.PORTKEY_MODEL_NAME
 
-        self.name = require_param(
-            model,
-            provider_label="Portkey",
-            env_var_name="PORTKEY_MODEL_NAME",
-            param_hint="model",
-        )
-
         if api_key is not None:
             # keep it secret, keep it safe from serializings, logging and alike
-            self.api_key: SecretStr
+            self.api_key: Optional[SecretStr] = SecretStr(api_key)
         else:
             self.api_key = settings.PORTKEY_API_KEY
 
@@ -47,6 +43,16 @@ class PortkeyModel(DeepEvalBaseLLM):
         elif settings.PORTKEY_BASE_URL is not None:
             base_url = str(settings.PORTKEY_BASE_URL).rstrip("/")
 
+        provider = provider or settings.PORTKEY_PROVIDER_NAME
+
+        # validation
+        model = require_param(
+            model,
+            provider_label="Portkey",
+            env_var_name="PORTKEY_MODEL_NAME",
+            param_hint="model",
+        )
+
         self.base_url = require_param(
             base_url,
             provider_label="Portkey",
@@ -54,7 +60,6 @@ class PortkeyModel(DeepEvalBaseLLM):
             param_hint="base_url",
         )
 
-        provider = provider or settings.PORTKEY_PROVIDER_NAME
         self.provider = require_param(
             provider,
             provider_label="Portkey",
@@ -64,6 +69,7 @@ class PortkeyModel(DeepEvalBaseLLM):
         # Keep sanitized kwargs for client call to strip legacy keys
         self.kwargs = kwargs
         self.generation_kwargs = generation_kwargs or {}
+        super().__init__(model)
 
     def _headers(self) -> Dict[str, str]:
         api_key = require_secret_api_key(
@@ -82,18 +88,51 @@ class PortkeyModel(DeepEvalBaseLLM):
         return headers
 
     def _payload(self, prompt: str) -> Dict[str, Any]:
+        if check_if_multimodal(prompt):
+            prompt = convert_to_multi_modal_array(input=prompt)
+            content = self.generate_content(prompt)
+        else:
+            content = [{"type": "text", "text": prompt}]
         payload = {
             "model": self.name,
-            "messages": [{"role": "user", "content": prompt}],
+            "messages": [{"role": "user", "content": content}],
         }
         if self.generation_kwargs:
             payload.update(self.generation_kwargs)
         return payload
 
+    def generate_content(
+        self, multimodal_input: List[Union[str, MLLMImage]] = []
+    ):
+        content = []
+        for element in multimodal_input:
+            if isinstance(element, str):
+                content.append({"type": "text", "text": element})
+            elif isinstance(element, MLLMImage):
+                if element.url and not element.local:
+                    content.append(
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": element.url},
+                        }
+                    )
+                else:
+                    element.ensure_images_loaded()
+                    data_uri = (
+                        f"data:{element.mimeType};base64,{element.dataBase64}"
+                    )
+                    content.append(
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": data_uri},
+                        }
+                    )
+        return content
+
     def _extract_content(self, data: Dict[str, Any]) -> str:
         choices: Union[List[Dict[str, Any]], None] = data.get("choices")
         if not choices:
-            raise
+            raise DeepEvalError("Portkey response did not include any choices.")
         message = choices[0].get("message", {})
         content: Union[str, List[Dict[str, Any]], None] = message.get("content")
         if isinstance(content, str):
@@ -109,7 +148,7 @@ class PortkeyModel(DeepEvalBaseLLM):
                 f"{self.base_url}/chat/completions",
                 json=self._payload(prompt),
                 headers=self._headers(),
-                timeout=
+                timeout=_request_timeout_seconds(),
             )
             response.raise_for_status()
         except requests.HTTPError as error:
@@ -118,11 +157,11 @@ class PortkeyModel(DeepEvalBaseLLM):
                 body = response.json()
             except Exception:
                 body = response.text
-            raise
+            raise DeepEvalError(
                 f"Portkey request failed with status {response.status_code}: {body}"
             ) from error
         except requests.RequestException as error:
-            raise
+            raise DeepEvalError(f"Portkey request failed: {error}") from error
        return self._extract_content(response.json())
 
    async def a_generate(self, prompt: str) -> str:
@@ -132,11 +171,11 @@ class PortkeyModel(DeepEvalBaseLLM):
                 f"{self.base_url}/chat/completions",
                 json=self._payload(prompt),
                 headers=self._headers(),
-                timeout=
+                timeout=_request_timeout_seconds(),
             ) as response:
                 if response.status >= 400:
                     body = await response.text()
-                    raise
+                    raise DeepEvalError(
                         f"Portkey request failed with status {response.status}: {body}"
                     )
                 data = await response.json()
@@ -147,3 +186,6 @@ class PortkeyModel(DeepEvalBaseLLM):
 
     def get_model_name(self):
         return f"{self.name} (Portkey)"
+
+    def supports_multimodal(self):
+        return True
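For orientation, this is roughly the OpenAI-style content array the new `_payload`/`generate_content` path builds when a prompt mixes text and images; the model name, URL, and base64 payload below are made-up illustrations, not values from the package:

```python
# Illustrative only: shape of the chat payload PortkeyModel now sends for a
# multimodal prompt. All literal values here are invented for the example.
payload = {
    "model": "gpt-4o",  # assumed model name
    "messages": [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe this screenshot."},
                # remote image: passed through as a plain URL
                {"type": "image_url", "image_url": {"url": "https://example.com/shot.png"}},
                # local image: inlined as a base64 data URI
                {"type": "image_url", "image_url": {"url": "data:image/png;base64,iVBORw0KGgo="}},
            ],
        }
    ],
}
```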
deepeval/models/llms/utils.py
CHANGED

@@ -1,8 +1,11 @@
-from typing import Dict
+from typing import Dict
 import re
 import json
 import asyncio
 
+from deepeval.errors import DeepEvalError
+
+
 MULTIMODAL_MODELS = ["GPTModel", "AzureModel", "GeminiModel", "OllamaModel"]
 
 
@@ -20,7 +23,7 @@ def trim_and_load_json(
         return json.loads(jsonStr)
     except json.JSONDecodeError:
         error_str = "Evaluation LLM outputted an invalid JSON. Please use a better evaluation model."
-        raise
+        raise DeepEvalError(error_str)
     except Exception as e:
         raise Exception(f"An unexpected error occurred: {str(e)}")
 
deepeval/models/retry_policy.py
CHANGED

@@ -87,6 +87,8 @@ def set_outer_deadline(seconds: float | None):
    call, which must be passed to `reset_outer_deadline` to restore the
    previous value.
    """
+    if get_settings().DEEPEVAL_DISABLE_TIMEOUTS:
+        return _OUTER_DEADLINE.set(None)
     if seconds and seconds > 0:
         return _OUTER_DEADLINE.set(time.monotonic() + seconds)
     return _OUTER_DEADLINE.set(None)
@@ -131,11 +133,10 @@ def resolve_effective_attempt_timeout():
        float: Seconds to use for the inner per-attempt timeout. `0` means
        disable inner timeout and rely on the outer budget instead.
    """
-
-
-    )
+    settings = get_settings()
+    per_attempt = float(settings.DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS or 0)
     # 0 or None disable inner wait_for. That means rely on outer task cap for timeouts instead.
-    if per_attempt <= 0:
+    if settings.DEEPEVAL_DISABLE_TIMEOUTS or per_attempt <= 0:
         return 0
     # If we do have a positive per-attempt, use up to remaining outer budget.
     rem = _remaining_budget()
@@ -557,7 +558,11 @@ def run_sync_with_timeout(func, timeout_seconds, *args, **kwargs):
        BaseException: If `func` raises, the same exception is re-raised with its
        original traceback.
    """
-    if
+    if (
+        get_settings().DEEPEVAL_DISABLE_TIMEOUTS
+        or not timeout_seconds
+        or timeout_seconds <= 0
+    ):
         return func(*args, **kwargs)
 
     # try to respect the global cap on concurrent timeout workers
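All three retry-policy hunks apply the same pattern: a single `DEEPEVAL_DISABLE_TIMEOUTS` setting short-circuits every timeout path before any deadline math runs. A minimal standalone sketch of that pattern, with an illustrative flag and helper rather than the package's own settings object:

```python
import concurrent.futures

DISABLE_TIMEOUTS = False  # stand-in for settings.DEEPEVAL_DISABLE_TIMEOUTS


def call_with_timeout(func, timeout_seconds, *args, **kwargs):
    # When timeouts are globally disabled (or no timeout is given),
    # run the callable inline instead of through a worker with a deadline.
    if DISABLE_TIMEOUTS or not timeout_seconds or timeout_seconds <= 0:
        return func(*args, **kwargs)
    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
        future = pool.submit(func, *args, **kwargs)
        return future.result(timeout=timeout_seconds)
```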
deepeval/models/utils.py
CHANGED

@@ -8,7 +8,7 @@ from deepeval.errors import DeepEvalError
 logger = logging.getLogger(__name__)
 
 
-def parse_model_name(model_name: Optional[str] = None) -> str:
+def parse_model_name(model_name: Optional[str] = None) -> Optional[str]:
     """Extract base model name from provider-prefixed format.
 
     This function is useful for extracting the actual model name from a
@@ -32,9 +32,9 @@ def parse_model_name(model_name: Optional[str] = None) -> str:
     if model_name is None:
         return None
 
-    if "/" in model_name:
-        _, parsed_model_name = model_name.split("/", 1)
-        return parsed_model_name
+    # if "/" in model_name:
+    #     _, parsed_model_name = model_name.split("/", 1)
+    #     return parsed_model_name
     return model_name
 
 
@@ -80,6 +80,58 @@ def require_secret_api_key(
     return api_key
 
 
+def require_costs(
+    model_data,
+    model_name: str,
+    input_token_envvar: str,
+    output_token_envvar: str,
+    cost_per_input_token: Optional[float] = None,
+    cost_per_output_token: Optional[float] = None,
+) -> Tuple[Optional[float], Optional[float]]:
+    """
+    Validates and returns the cost parameters (input and output tokens) for a model.
+
+    Arguments:
+        - model_data: The model's data object, which should contain `input_price` and `output_price`.
+        - model_name: The model name used for error messaging.
+        - cost_per_input_token: The input token cost provided during model initialization (optional).
+        - cost_per_output_token: The output token cost provided during model initialization (optional).
+        - input_token_envvar: The environment variable name for input cost.
+        - output_token_envvar: The environment variable name for output cost.
+
+    Returns:
+        - A tuple of validated values (input_cost, output_cost). If the values are provided, they are returned.
+          If not provided, they are fetched from settings or environment variables.
+    """
+
+    def validate_cost(
+        value: Optional[float], envvar_name: str
+    ) -> Optional[float]:
+        """Helper function to validate the cost values."""
+        if value is not None and value < 0:
+            raise DeepEvalError(f"{envvar_name} must be >= 0.")
+        return value
+
+    # Validate provided token costs
+    cost_per_input_token = validate_cost(
+        cost_per_input_token, input_token_envvar
+    )
+    cost_per_output_token = validate_cost(
+        cost_per_output_token, output_token_envvar
+    )
+
+    # If model data doesn't have pricing, use provided values or environment variables
+    if model_data.input_price is None or model_data.output_price is None:
+        if cost_per_input_token is None or cost_per_output_token is None:
+            return None, None
+
+        # Return the validated cost values as a tuple
+        return cost_per_input_token, cost_per_output_token
+
+    # If no custom cost values are provided, return model's default cost values
+    return model_data.input_price, model_data.output_price
+
+
 def normalize_kwargs_and_extract_aliases(
     provider_label: str,
     kwargs: Dict[str, Any],
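A rough usage sketch for the new `require_costs` helper; the `PricingRecord` class, model names, and environment-variable names below are hypothetical stand-ins for whatever pricing data a model wrapper would pass in (assumes deepeval >= 3.7.7):

```python
from dataclasses import dataclass
from typing import Optional

from deepeval.models.utils import require_costs


@dataclass
class PricingRecord:  # hypothetical pricing record with the two fields require_costs reads
    input_price: Optional[float] = None
    output_price: Optional[float] = None


# Model with catalog pricing: the catalog values win.
priced = PricingRecord(input_price=3e-06, output_price=1.5e-05)
assert require_costs(
    priced,
    "some-known-model",                 # illustrative model name
    "EXAMPLE_COST_PER_INPUT_TOKEN",     # illustrative env-var names (used for error messages)
    "EXAMPLE_COST_PER_OUTPUT_TOKEN",
) == (3e-06, 1.5e-05)

# Unknown model with user-supplied costs: the user values are validated and returned.
unpriced = PricingRecord()
assert require_costs(
    unpriced,
    "my-custom-model",
    "EXAMPLE_COST_PER_INPUT_TOKEN",
    "EXAMPLE_COST_PER_OUTPUT_TOKEN",
    cost_per_input_token=1e-06,
    cost_per_output_token=2e-06,
) == (1e-06, 2e-06)
```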
deepeval/simulator/conversation_simulator.py
CHANGED

@@ -20,6 +20,7 @@ from deepeval.simulator.template import (
     ConversationSimulatorTemplate,
 )
 from deepeval.models import DeepEvalBaseLLM
+from deepeval.metrics.utils import MULTIMODAL_SUPPORTED_MODELS
 from deepeval.simulator.schema import (
     SimulatedInput,
     ConversationCompletion,
@@ -94,6 +95,26 @@ class ConversationSimulator:
                 )
             )
         else:
+            multimodal = any(
+                [golden.multimodal for golden in conversational_goldens]
+            )
+            if multimodal:
+                if (
+                    not self.simulator_model
+                    or not self.simulator_model.supports_multimodal()
+                ):
+                    if (
+                        self.simulator_model
+                        and type(self.simulator_model)
+                        in MULTIMODAL_SUPPORTED_MODELS
+                    ):
+                        raise ValueError(
+                            f"The evaluation model {self.simulator_model.name} does not support multimodal evaluations at the moment. Available multi-modal models for the {self.simulator_model.__class__.__name__} provider includes {', '.join(self.simulator_model.__class__.valid_multimodal_models)}."
+                        )
+                    else:
+                        raise ValueError(
+                            f"The evaluation model {self.simulator_model.name} does not support multimodal inputs, please use one of the following evaluation models: {', '.join([cls.__name__ for cls in MULTIMODAL_SUPPORTED_MODELS])}"
+                        )
             conversational_test_cases: List[ConversationalTestCase] = []
             for conversation_index, golden in enumerate(
                 conversational_goldens
@@ -124,6 +145,28 @@ class ConversationSimulator:
         progress: Optional[Progress] = None,
         pbar_id: Optional[int] = None,
     ) -> List[ConversationalTestCase]:
+
+        multimodal = any(
+            [golden.multimodal for golden in conversational_goldens]
+        )
+        if multimodal:
+            if (
+                not self.simulator_model
+                or not self.simulator_model.supports_multimodal()
+            ):
+                if (
+                    self.simulator_model
+                    and type(self.simulator_model)
+                    in MULTIMODAL_SUPPORTED_MODELS
+                ):
+                    raise ValueError(
+                        f"The evaluation model {self.simulator_model.name} does not support multimodal evaluations at the moment. Available multi-modal models for the {self.simulator_model.__class__.__name__} provider includes {', '.join(self.simulator_model.__class__.valid_multimodal_models)}."
+                    )
+                else:
+                    raise ValueError(
+                        f"The evaluation model {self.simulator_model.name} does not support multimodal inputs, please use one of the following evaluation models: {', '.join([cls.__name__ for cls in MULTIMODAL_SUPPORTED_MODELS])}"
+                    )
+
         self.simulation_cost = 0 if self.using_native_model else None
 
         async def simulate_conversations(
@@ -471,7 +514,9 @@ class ConversationSimulator:
         ):
             if not self.run_remote:
                 conversation_history = json.dumps(
-                    [t.model_dump() for t in turns],
+                    [t.model_dump() for t in turns],
+                    indent=4,
+                    ensure_ascii=False,
                 )
                 prompt = self.template.stop_simulation(
                     conversation_history, golden.expected_outcome
@@ -516,7 +561,9 @@ class ConversationSimulator:
         ):
             if not self.run_remote:
                 conversation_history = json.dumps(
-                    [t.model_dump() for t in turns],
+                    [t.model_dump() for t in turns],
+                    indent=4,
+                    ensure_ascii=False,
                 )
                 prompt = self.template.stop_simulation(
                     conversation_history, golden.expected_outcome
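The simulator now checks the evaluation model's multimodal capability up front whenever any golden is marked multimodal. A condensed sketch of that gate with a stand-in model class (not the package's actual classes):

```python
class StubModel:  # stand-in for a DeepEvalBaseLLM subclass
    name = "text-only-model"

    def supports_multimodal(self) -> bool:
        return False


def assert_can_handle(goldens, model) -> None:
    # Mirror of the new gate: only complain when at least one golden is multimodal.
    if any(getattr(g, "multimodal", False) for g in goldens):
        if model is None or not model.supports_multimodal():
            raise ValueError(
                f"{getattr(model, 'name', model)} cannot evaluate multimodal goldens"
            )
```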
deepeval/simulator/template.py
CHANGED

@@ -7,6 +7,13 @@ from deepeval.test_case import Turn
 
 
 class ConversationSimulatorTemplate:
+    multimodal_rules = """
+    --- MULTIMODAL INPUT RULES ---
+    - Treat image content as factual evidence.
+    - Only reference visual details that are explicitly and clearly visible.
+    - Do not infer or guess objects, text, or details not visibly present.
+    - If an image is unclear or ambiguous, mark uncertainty explicitly.
+    """
 
     @staticmethod
     def simulate_first_user_turn(
@@ -23,6 +30,8 @@ class ConversationSimulatorTemplate:
         3. Avoid providing excessive details upfront; the goal is to initiate the conversation and build rapport, not to solve it in the first message.
         4. The message should be concise, ideally no more than 1-3 sentences.
 
+        {ConversationSimulatorTemplate.multimodal_rules}
+
         IMPORTANT: The output must be formatted as a JSON object with a single key `simulated_input`, where the value is the generated opening message in {language}.
 
         Example Language: english
@@ -48,7 +57,9 @@ class ConversationSimulatorTemplate:
         language: str,
     ) -> str:
         previous_conversation = json.dumps(
-            [t.model_dump() for t in turns],
+            [t.model_dump() for t in turns],
+            indent=4,
+            ensure_ascii=False,
         )
         prompt = textwrap.dedent(
             f"""
@@ -61,6 +72,8 @@ class ConversationSimulatorTemplate:
         3. Keep the tone consistent with the previous user inputs.
         4. The generated user input should be concise, ideally no more than 1-2 sentences.
 
+        {ConversationSimulatorTemplate.multimodal_rules}
+
         IMPORTANT: The output must be formatted as a JSON object with a single key `simulated_input`,
         where the value is the generated user input in {language}.
 
@@ -101,6 +114,8 @@ class ConversationSimulatorTemplate:
         2. If the expected outcome has been met, mark the conversation as complete.
         3. If not, mark it as incomplete and briefly describe what remains to be done.
 
+        {ConversationSimulatorTemplate.multimodal_rules}
+
         IMPORTANT: The output must be formatted as a JSON object with two keys:
         `is_complete` (a boolean) and `reason` (a string).
 
deepeval/synthesizer/synthesizer.py
CHANGED

@@ -25,7 +25,7 @@ from deepeval.metrics.utils import (
 from deepeval.progress_context import synthesizer_progress_context
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.dataset.golden import Golden, ConversationalGolden
-from deepeval.synthesizer.types import
+from deepeval.synthesizer.types import Evolution, PromptEvolution
 from deepeval.synthesizer.templates import (
     EvolutionTemplate,
     SynthesizerTemplate,
@@ -246,7 +246,7 @@ class Synthesizer:
         )
         if self.cost_tracking and self.using_native_model:
             print(f"💰 API cost: {self.synthesis_cost:.6f}")
-        if _send_data
+        if _send_data:
             pass
         remove_pbars(
             progress,
@@ -546,7 +546,7 @@ class Synthesizer:
         # Remove pbar if not from docs
         remove_pbars(progress, [pbar_id]) if _progress is None else None
 
-        if _send_data
+        if _send_data:
             pass
         if _reset_cost and self.cost_tracking and self.using_native_model:
             print(f"💰 API cost: {self.synthesis_cost:.6f}")
@@ -567,7 +567,8 @@ class Synthesizer:
         if _reset_cost:
             self.synthetic_goldens = []
             self.synthesis_cost = 0 if self.using_native_model else None
-
+        context_semaphore = asyncio.Semaphore(self.max_concurrent)
+        worker_semaphore = asyncio.Semaphore(self.max_concurrent)
         goldens: List[Golden] = []
 
         with synthesizer_progress_context(
@@ -586,9 +587,9 @@ class Synthesizer:
         ):
             tasks = [
                 self.task_wrapper(
-
+                    context_semaphore,
                     self._a_generate_from_context,
-                    semaphore=
+                    semaphore=worker_semaphore,
                     context=context,
                     goldens=goldens,
                     include_expected_output=include_expected_output,
@@ -965,7 +966,7 @@ class Synthesizer:
 
         # Wrap up Synthesis
         self.synthetic_goldens.extend(goldens)
-        if _send_data
+        if _send_data:
             pass
         return goldens
 
@@ -1023,7 +1024,7 @@ class Synthesizer:
                 source_files.append(golden.source_file)
 
         # Extract styles from goldens if not already set
-        if self.set_styling_config
+        if not self.set_styling_config:
             example_inputs = random.sample(
                 [golden.input for golden in goldens], min(len(goldens), 10)
             )
@@ -1069,7 +1070,7 @@ class Synthesizer:
                 source_files.append(golden.source_file)
 
         # Extract styles from goldens if not already set
-        if self.set_styling_config
+        if not self.set_styling_config:
             example_inputs = random.sample(
                 [golden.input for golden in goldens], min(len(goldens), 10)
             )
@@ -1637,7 +1638,7 @@ class Synthesizer:
         )
         if self.cost_tracking and self.using_native_model:
             print(f"💰 API cost: {self.synthesis_cost:.6f}")
-        if _send_data
+        if _send_data:
             pass
         remove_pbars(
             progress,
@@ -1949,7 +1950,7 @@ class Synthesizer:
         # Remove pbar if not from docs
         remove_pbars(progress, [pbar_id]) if _progress is None else None
 
-        if _send_data
+        if _send_data:
             pass
         if _reset_cost and self.cost_tracking and self.using_native_model:
             print(f"💰 API cost: {self.synthesis_cost:.6f}")
@@ -1970,7 +1971,8 @@ class Synthesizer:
         if _reset_cost:
             self.synthetic_conversational_goldens = []
             self.synthesis_cost = 0 if self.using_native_model else None
-
+        context_semaphore = asyncio.Semaphore(self.max_concurrent)
+        worker_semaphore = asyncio.Semaphore(self.max_concurrent)
         goldens: List[ConversationalGolden] = []
 
         with synthesizer_progress_context(
@@ -1989,9 +1991,9 @@ class Synthesizer:
         ):
             tasks = [
                 self.task_wrapper(
-
+                    context_semaphore,
                     self._a_generate_conversational_from_context,
-                    semaphore=
+                    semaphore=worker_semaphore,
                     context=context,
                     goldens=goldens,
                     include_expected_outcome=include_expected_outcome,
@@ -2335,7 +2337,7 @@ class Synthesizer:
 
         # Wrap up Synthesis
         self.synthetic_conversational_goldens.extend(goldens)
-        if _send_data
+        if _send_data:
             pass
         return goldens
 
@@ -2567,7 +2569,7 @@ class Synthesizer:
                 contexts.append(golden.context)
 
         # Extract styles from conversational goldens if not already set
-        if self.set_conversational_styling_config
+        if not self.set_conversational_styling_config:
             example_scenarios = random.sample(
                 [golden.scenario for golden in goldens],
                 min(len(goldens), 10),
@@ -2612,7 +2614,7 @@ class Synthesizer:
                 contexts.append(golden.context)
 
         # Extract styles from conversational goldens if not already set
-        if self.set_conversational_styling_config
+        if not self.set_conversational_styling_config:
             example_scenarios = random.sample(
                 [golden.scenario for golden in goldens], min(len(goldens), 10)
             )
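The synthesizer hunks replace the single shared semaphore with two, one acquired by the per-context wrapper and one handed down to the worker coroutine, presumably so the outer tasks and their children no longer compete for the same permits. A minimal standalone sketch of that two-pool pattern; all names here are illustrative:

```python
import asyncio

MAX_CONCURRENT = 4


async def worker(semaphore: asyncio.Semaphore, item: int) -> int:
    # Inner work acquires a permit from its own pool.
    async with semaphore:
        await asyncio.sleep(0.01)
        return item * 2


async def per_context_task(
    context_semaphore: asyncio.Semaphore,
    worker_semaphore: asyncio.Semaphore,
    item: int,
) -> int:
    # The outer task holds a permit from a different pool than its children,
    # so a saturated outer pool cannot starve the inner workers.
    async with context_semaphore:
        return await worker(worker_semaphore, item)


async def main() -> list:
    context_semaphore = asyncio.Semaphore(MAX_CONCURRENT)
    worker_semaphore = asyncio.Semaphore(MAX_CONCURRENT)
    tasks = [
        per_context_task(context_semaphore, worker_semaphore, i) for i in range(10)
    ]
    return await asyncio.gather(*tasks)


if __name__ == "__main__":
    print(asyncio.run(main()))
```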
deepeval/test_case/api.py
CHANGED

@@ -12,7 +12,6 @@ from deepeval.test_case import (
     ConversationalTestCase,
     Turn,
 )
-from deepeval.test_case.llm_test_case import _MLLM_IMAGE_REGISTRY
 from deepeval.constants import PYTEST_RUN_TEST_NAME
 
 
@@ -33,7 +32,6 @@ def create_api_test_case(
     trace: Optional[TraceApi] = None,
     index: Optional[int] = None,
 ) -> Union[LLMApiTestCase, ConversationalApiTestCase]:
-    from deepeval.utils import convert_to_multi_modal_array
 
     if isinstance(test_case, ConversationalTestCase):
         order = (
@@ -61,8 +59,10 @@ def create_api_test_case(
             context=test_case.context,
             tags=test_case.tags,
             comments=test_case.comments,
+            imagesMapping=test_case._get_images_mapping(),
             additionalMetadata=test_case.additional_metadata,
         )
+
         api_test_case.turns = [
             create_api_turn(
                 turn=turn,
@@ -86,48 +86,27 @@ def create_api_test_case(
         name = os.getenv(PYTEST_RUN_TEST_NAME, f"test_case_{order}")
         metrics_data = []
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        elif isinstance(test_case, LLMTestCase) and test_case.multimodal:
-            api_test_case = LLMApiTestCase(
-                name=name,
-                input=test_case.input,
-                actualOutput=test_case.actual_output,
-                expectedOutput=test_case.expected_output,
-                retrievalContext=test_case.retrieval_context,
-                context=test_case.context,
-                imagesMapping=_MLLM_IMAGE_REGISTRY,
-                toolsCalled=test_case.tools_called,
-                expectedTools=test_case.expected_tools,
-                tokenCost=test_case.token_cost,
-                completionTime=test_case.completion_time,
-                success=success,
-                metricsData=metrics_data,
-                runDuration=None,
-                evaluationCost=None,
-                order=order,
-                additionalMetadata=test_case.additional_metadata,
-                comments=test_case.comments,
-            )
+        api_test_case = LLMApiTestCase(
+            name=name,
+            input=test_case.input,
+            actualOutput=test_case.actual_output,
+            expectedOutput=test_case.expected_output,
+            retrievalContext=test_case.retrieval_context,
+            context=test_case.context,
+            imagesMapping=test_case._get_images_mapping(),
+            toolsCalled=test_case.tools_called,
+            expectedTools=test_case.expected_tools,
+            tokenCost=test_case.token_cost,
+            completionTime=test_case.completion_time,
+            success=success,
+            metricsData=metrics_data,
+            runDuration=None,
+            evaluationCost=None,
+            order=order,
+            additionalMetadata=test_case.additional_metadata,
+            comments=test_case.comments,
+            tags=test_case.tags,
+            trace=trace,
+        )
         # llm_test_case_lookup_map[instance_id] = api_test_case
         return api_test_case