deepeval 3.7.5__py3-none-any.whl → 3.7.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/config/settings.py +35 -1
- deepeval/dataset/api.py +23 -1
- deepeval/dataset/golden.py +106 -21
- deepeval/evaluate/evaluate.py +0 -3
- deepeval/evaluate/execute.py +10 -222
- deepeval/evaluate/utils.py +6 -30
- deepeval/key_handler.py +3 -0
- deepeval/metrics/__init__.py +0 -4
- deepeval/metrics/answer_relevancy/answer_relevancy.py +89 -132
- deepeval/metrics/answer_relevancy/template.py +102 -179
- deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
- deepeval/metrics/arena_g_eval/template.py +17 -1
- deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
- deepeval/metrics/argument_correctness/template.py +19 -2
- deepeval/metrics/base_metric.py +13 -41
- deepeval/metrics/bias/bias.py +102 -108
- deepeval/metrics/bias/template.py +14 -2
- deepeval/metrics/contextual_precision/contextual_precision.py +56 -92
- deepeval/metrics/contextual_recall/contextual_recall.py +58 -85
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +53 -83
- deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
- deepeval/metrics/conversation_completeness/template.py +23 -3
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
- deepeval/metrics/conversational_dag/nodes.py +66 -123
- deepeval/metrics/conversational_dag/templates.py +16 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
- deepeval/metrics/dag/dag.py +10 -0
- deepeval/metrics/dag/nodes.py +63 -126
- deepeval/metrics/dag/templates.py +14 -0
- deepeval/metrics/exact_match/exact_match.py +9 -1
- deepeval/metrics/faithfulness/faithfulness.py +82 -136
- deepeval/metrics/g_eval/g_eval.py +87 -78
- deepeval/metrics/g_eval/template.py +18 -1
- deepeval/metrics/g_eval/utils.py +7 -6
- deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
- deepeval/metrics/goal_accuracy/template.py +21 -3
- deepeval/metrics/hallucination/hallucination.py +60 -75
- deepeval/metrics/hallucination/template.py +13 -0
- deepeval/metrics/indicator.py +3 -6
- deepeval/metrics/json_correctness/json_correctness.py +40 -38
- deepeval/metrics/json_correctness/template.py +10 -0
- deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
- deepeval/metrics/knowledge_retention/schema.py +9 -3
- deepeval/metrics/knowledge_retention/template.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +68 -38
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
- deepeval/metrics/mcp/template.py +52 -0
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
- deepeval/metrics/mcp_use_metric/template.py +12 -0
- deepeval/metrics/misuse/misuse.py +77 -97
- deepeval/metrics/misuse/template.py +15 -0
- deepeval/metrics/multimodal_metrics/__init__.py +0 -1
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +37 -38
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +55 -76
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +37 -38
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +37 -38
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +57 -76
- deepeval/metrics/non_advice/non_advice.py +79 -105
- deepeval/metrics/non_advice/template.py +12 -0
- deepeval/metrics/pattern_match/pattern_match.py +12 -4
- deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
- deepeval/metrics/pii_leakage/template.py +14 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
- deepeval/metrics/plan_adherence/template.py +11 -0
- deepeval/metrics/plan_quality/plan_quality.py +63 -87
- deepeval/metrics/plan_quality/template.py +9 -0
- deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
- deepeval/metrics/prompt_alignment/template.py +12 -0
- deepeval/metrics/role_adherence/role_adherence.py +48 -71
- deepeval/metrics/role_adherence/template.py +14 -0
- deepeval/metrics/role_violation/role_violation.py +75 -108
- deepeval/metrics/role_violation/template.py +12 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
- deepeval/metrics/step_efficiency/template.py +11 -0
- deepeval/metrics/summarization/summarization.py +115 -183
- deepeval/metrics/summarization/template.py +19 -0
- deepeval/metrics/task_completion/task_completion.py +67 -73
- deepeval/metrics/tool_correctness/tool_correctness.py +43 -42
- deepeval/metrics/tool_use/tool_use.py +42 -66
- deepeval/metrics/topic_adherence/template.py +13 -0
- deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
- deepeval/metrics/toxicity/template.py +13 -0
- deepeval/metrics/toxicity/toxicity.py +80 -99
- deepeval/metrics/turn_contextual_precision/schema.py +3 -3
- deepeval/metrics/turn_contextual_precision/template.py +1 -1
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +110 -68
- deepeval/metrics/turn_contextual_recall/schema.py +3 -3
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +104 -61
- deepeval/metrics/turn_contextual_relevancy/schema.py +2 -2
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +106 -65
- deepeval/metrics/turn_faithfulness/schema.py +1 -1
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +104 -73
- deepeval/metrics/turn_relevancy/template.py +14 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
- deepeval/metrics/utils.py +145 -90
- deepeval/models/base_model.py +44 -6
- deepeval/models/embedding_models/azure_embedding_model.py +34 -12
- deepeval/models/embedding_models/local_embedding_model.py +22 -7
- deepeval/models/embedding_models/ollama_embedding_model.py +17 -6
- deepeval/models/embedding_models/openai_embedding_model.py +3 -2
- deepeval/models/llms/amazon_bedrock_model.py +226 -71
- deepeval/models/llms/anthropic_model.py +141 -47
- deepeval/models/llms/azure_model.py +167 -94
- deepeval/models/llms/constants.py +2032 -0
- deepeval/models/llms/deepseek_model.py +79 -29
- deepeval/models/llms/gemini_model.py +126 -67
- deepeval/models/llms/grok_model.py +125 -59
- deepeval/models/llms/kimi_model.py +126 -81
- deepeval/models/llms/litellm_model.py +92 -18
- deepeval/models/llms/local_model.py +114 -15
- deepeval/models/llms/ollama_model.py +97 -76
- deepeval/models/llms/openai_model.py +167 -310
- deepeval/models/llms/portkey_model.py +58 -16
- deepeval/models/llms/utils.py +5 -2
- deepeval/models/utils.py +60 -4
- deepeval/simulator/conversation_simulator.py +43 -0
- deepeval/simulator/template.py +13 -0
- deepeval/test_case/api.py +24 -45
- deepeval/test_case/arena_test_case.py +7 -2
- deepeval/test_case/conversational_test_case.py +55 -6
- deepeval/test_case/llm_test_case.py +60 -6
- deepeval/test_run/api.py +3 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -1
- {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/RECORD +128 -132
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -133
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
- {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
deepeval/models/llms/portkey_model.py CHANGED

@@ -3,10 +3,13 @@ import requests
 from typing import Any, Dict, List, Optional, Union
 from pydantic import AnyUrl, SecretStr
 
+from deepeval.errors import DeepEvalError
 from deepeval.config.settings import get_settings
 from deepeval.models.utils import (
     require_secret_api_key,
 )
+from deepeval.test_case import MLLMImage
+from deepeval.utils import check_if_multimodal, convert_to_multi_modal_array
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.utils import require_param
 
@@ -29,16 +32,9 @@ class PortkeyModel(DeepEvalBaseLLM):
         settings = get_settings()
         model = model or settings.PORTKEY_MODEL_NAME
 
-        self.name = require_param(
-            model,
-            provider_label="Portkey",
-            env_var_name="PORTKEY_MODEL_NAME",
-            param_hint="model",
-        )
-
         if api_key is not None:
             # keep it secret, keep it safe from serializings, logging and alike
-            self.api_key: SecretStr …
+            self.api_key: Optional[SecretStr] = SecretStr(api_key)
         else:
             self.api_key = settings.PORTKEY_API_KEY
 
@@ -47,6 +43,16 @@ class PortkeyModel(DeepEvalBaseLLM):
         elif settings.PORTKEY_BASE_URL is not None:
             base_url = str(settings.PORTKEY_BASE_URL).rstrip("/")
 
+        provider = provider or settings.PORTKEY_PROVIDER_NAME
+
+        # validation
+        model = require_param(
+            model,
+            provider_label="Portkey",
+            env_var_name="PORTKEY_MODEL_NAME",
+            param_hint="model",
+        )
+
         self.base_url = require_param(
             base_url,
             provider_label="Portkey",
@@ -54,7 +60,6 @@ class PortkeyModel(DeepEvalBaseLLM):
             param_hint="base_url",
         )
 
-        provider = provider or settings.PORTKEY_PROVIDER_NAME
         self.provider = require_param(
             provider,
             provider_label="Portkey",
@@ -64,6 +69,7 @@ class PortkeyModel(DeepEvalBaseLLM):
         # Keep sanitized kwargs for client call to strip legacy keys
         self.kwargs = kwargs
         self.generation_kwargs = generation_kwargs or {}
+        super().__init__(model)
 
     def _headers(self) -> Dict[str, str]:
         api_key = require_secret_api_key(
@@ -82,18 +88,51 @@ class PortkeyModel(DeepEvalBaseLLM):
         return headers
 
     def _payload(self, prompt: str) -> Dict[str, Any]:
+        if check_if_multimodal(prompt):
+            prompt = convert_to_multi_modal_array(input=prompt)
+            content = self.generate_content(prompt)
+        else:
+            content = [{"type": "text", "text": prompt}]
         payload = {
             "model": self.name,
-            "messages": [{"role": "user", "content": …
+            "messages": [{"role": "user", "content": content}],
         }
         if self.generation_kwargs:
             payload.update(self.generation_kwargs)
         return payload
 
+    def generate_content(
+        self, multimodal_input: List[Union[str, MLLMImage]] = []
+    ):
+        content = []
+        for element in multimodal_input:
+            if isinstance(element, str):
+                content.append({"type": "text", "text": element})
+            elif isinstance(element, MLLMImage):
+                if element.url and not element.local:
+                    content.append(
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": element.url},
+                        }
+                    )
+                else:
+                    element.ensure_images_loaded()
+                    data_uri = (
+                        f"data:{element.mimeType};base64,{element.dataBase64}"
+                    )
+                    content.append(
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": data_uri},
+                        }
+                    )
+        return content
+
     def _extract_content(self, data: Dict[str, Any]) -> str:
         choices: Union[List[Dict[str, Any]], None] = data.get("choices")
         if not choices:
-            raise …
+            raise DeepEvalError("Portkey response did not include any choices.")
         message = choices[0].get("message", {})
         content: Union[str, List[Dict[str, Any]], None] = message.get("content")
         if isinstance(content, str):
@@ -109,7 +148,7 @@ class PortkeyModel(DeepEvalBaseLLM):
                 f"{self.base_url}/chat/completions",
                 json=self._payload(prompt),
                 headers=self._headers(),
-                timeout=…
+                timeout=_request_timeout_seconds(),
             )
             response.raise_for_status()
         except requests.HTTPError as error:
@@ -118,11 +157,11 @@ class PortkeyModel(DeepEvalBaseLLM):
                 body = response.json()
             except Exception:
                 body = response.text
-            raise …
+            raise DeepEvalError(
                 f"Portkey request failed with status {response.status_code}: {body}"
             ) from error
         except requests.RequestException as error:
-            raise …
+            raise DeepEvalError(f"Portkey request failed: {error}") from error
         return self._extract_content(response.json())
 
     async def a_generate(self, prompt: str) -> str:
@@ -132,11 +171,11 @@ class PortkeyModel(DeepEvalBaseLLM):
                 f"{self.base_url}/chat/completions",
                 json=self._payload(prompt),
                 headers=self._headers(),
-                timeout=…
+                timeout=_request_timeout_seconds(),
             ) as response:
                 if response.status >= 400:
                     body = await response.text()
-                    raise …
+                    raise DeepEvalError(
                         f"Portkey request failed with status {response.status}: {body}"
                     )
                 data = await response.json()
@@ -147,3 +186,6 @@ class PortkeyModel(DeepEvalBaseLLM):
 
     def get_model_name(self):
         return f"{self.name} (Portkey)"
+
+    def supports_multimodal(self):
+        return True
deepeval/models/llms/utils.py CHANGED

@@ -1,8 +1,11 @@
-from typing import Dict
+from typing import Dict
 import re
 import json
 import asyncio
 
+from deepeval.errors import DeepEvalError
+
+
 MULTIMODAL_MODELS = ["GPTModel", "AzureModel", "GeminiModel", "OllamaModel"]
 
 
@@ -20,7 +23,7 @@ def trim_and_load_json(
         return json.loads(jsonStr)
     except json.JSONDecodeError:
         error_str = "Evaluation LLM outputted an invalid JSON. Please use a better evaluation model."
-        raise …
+        raise DeepEvalError(error_str)
     except Exception as e:
         raise Exception(f"An unexpected error occurred: {str(e)}")
 
deepeval/models/utils.py CHANGED

@@ -8,7 +8,7 @@ from deepeval.errors import DeepEvalError
 logger = logging.getLogger(__name__)
 
 
-def parse_model_name(model_name: Optional[str] = None) -> str:
+def parse_model_name(model_name: Optional[str] = None) -> Optional[str]:
     """Extract base model name from provider-prefixed format.
 
     This function is useful for extracting the actual model name from a
@@ -32,9 +32,9 @@ def parse_model_name(model_name: Optional[str] = None) -> Optional[str]:
     if model_name is None:
         return None
 
-    if "/" in model_name:
-        _, parsed_model_name = model_name.split("/", 1)
-        return parsed_model_name
+    # if "/" in model_name:
+    #     _, parsed_model_name = model_name.split("/", 1)
+    #     return parsed_model_name
     return model_name
 
 
@@ -80,6 +80,62 @@ def require_secret_api_key(
     return api_key
 
 
+def require_costs(
+    model_data,
+    model_name: str,
+    input_token_envvar: str,
+    output_token_envvar: str,
+    cost_per_input_token: Optional[float] = None,
+    cost_per_output_token: Optional[float] = None,
+) -> Tuple[Optional[float], Optional[float]]:
+    """
+    Validates and returns the cost parameters (input and output tokens) for a model.
+
+    Arguments:
+        - model_data: The model's data object, which should contain `input_price` and `output_price`.
+        - model_name: The model name used for error messaging.
+        - cost_per_input_token: The input token cost provided during model initialization (optional).
+        - cost_per_output_token: The output token cost provided during model initialization (optional).
+        - input_token_envvar: The environment variable name for input cost.
+        - output_token_envvar: The environment variable name for output cost.
+
+    Returns:
+        - A tuple of validated values (input_cost, output_cost). If the values are provided, they are returned.
+          If not provided, they are fetched from settings or environment variables.
+    """
+
+    def validate_cost(
+        value: Optional[float], envvar_name: str
+    ) -> Optional[float]:
+        """Helper function to validate the cost values."""
+        if value is not None and value < 0:
+            raise DeepEvalError(f"{envvar_name} must be >= 0.")
+        return value
+
+    # Validate provided token costs
+    cost_per_input_token = validate_cost(
+        cost_per_input_token, input_token_envvar
+    )
+    cost_per_output_token = validate_cost(
+        cost_per_output_token, output_token_envvar
+    )
+
+    # If model data doesn't have pricing, use provided values or environment variables
+    if model_data.input_price is None or model_data.output_price is None:
+        if cost_per_input_token is None or cost_per_output_token is None:
+            raise DeepEvalError(
+                f"No pricing available for `{model_name}`. "
+                f"Please provide both `cost_per_input_token` and `cost_per_output_token` when initializing `{model_name}`, "
+                f"or set {input_token_envvar} and {output_token_envvar} environment variables."
+            )
+
+        # Return the validated cost values as a tuple
+        return cost_per_input_token, cost_per_output_token
+
+    # If no custom cost values are provided, return model's default cost values
+    return model_data.input_price, model_data.output_price
+
+
 def normalize_kwargs_and_extract_aliases(
     provider_label: str,
     kwargs: Dict[str, Any],
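Note: a hedged usage sketch of the new `require_costs` helper; the model-data object and env-var names below are illustrative stand-ins (real callers presumably pass entries from the model tables this release adds in deepeval/models/llms/constants.py). As written above, pricing found on `model_data` takes precedence, and the explicit overrides are only consulted when the table has no prices:

from types import SimpleNamespace
from deepeval.models.utils import require_costs

# Illustrative stand-in for a model-data record with no table pricing.
model_data = SimpleNamespace(input_price=None, output_price=None)

input_cost, output_cost = require_costs(
    model_data,
    model_name="my-custom-model",
    input_token_envvar="LOCAL_MODEL_COST_PER_INPUT_TOKEN",    # hypothetical name
    output_token_envvar="LOCAL_MODEL_COST_PER_OUTPUT_TOKEN",  # hypothetical name
    cost_per_input_token=3e-06,
    cost_per_output_token=1.5e-05,
)
# -> (3e-06, 1.5e-05): the overrides are used because model_data has no pricing.
# With both overrides omitted, require_costs raises DeepEvalError instead.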
deepeval/simulator/conversation_simulator.py CHANGED

@@ -20,6 +20,7 @@ from deepeval.simulator.template import (
     ConversationSimulatorTemplate,
 )
 from deepeval.models import DeepEvalBaseLLM
+from deepeval.metrics.utils import MULTIMODAL_SUPPORTED_MODELS
 from deepeval.simulator.schema import (
     SimulatedInput,
     ConversationCompletion,
@@ -94,6 +95,26 @@ class ConversationSimulator:
                 )
             )
         else:
+            multimodal = any(
+                [golden.multimodal for golden in conversational_goldens]
+            )
+            if multimodal:
+                if (
+                    not self.simulator_model
+                    or not self.simulator_model.supports_multimodal()
+                ):
+                    if (
+                        self.simulator_model
+                        and type(self.simulator_model)
+                        in MULTIMODAL_SUPPORTED_MODELS
+                    ):
+                        raise ValueError(
+                            f"The evaluation model {self.simulator_model.name} does not support multimodal evaluations at the moment. Available multi-modal models for the {self.simulator_model.__class__.__name__} provider includes {', '.join(self.simulator_model.__class__.valid_multimodal_models)}."
+                        )
+                    else:
+                        raise ValueError(
+                            f"The evaluation model {self.simulator_model.name} does not support multimodal inputs, please use one of the following evaluation models: {', '.join([cls.__name__ for cls in MULTIMODAL_SUPPORTED_MODELS])}"
+                        )
             conversational_test_cases: List[ConversationalTestCase] = []
             for conversation_index, golden in enumerate(
                 conversational_goldens
@@ -124,6 +145,28 @@ class ConversationSimulator:
         progress: Optional[Progress] = None,
         pbar_id: Optional[int] = None,
     ) -> List[ConversationalTestCase]:
+
+        multimodal = any(
+            [golden.multimodal for golden in conversational_goldens]
+        )
+        if multimodal:
+            if (
+                not self.simulator_model
+                or not self.simulator_model.supports_multimodal()
+            ):
+                if (
+                    self.simulator_model
+                    and type(self.simulator_model)
+                    in MULTIMODAL_SUPPORTED_MODELS
+                ):
+                    raise ValueError(
+                        f"The evaluation model {self.simulator_model.name} does not support multimodal evaluations at the moment. Available multi-modal models for the {self.simulator_model.__class__.__name__} provider includes {', '.join(self.simulator_model.__class__.valid_multimodal_models)}."
+                    )
+                else:
+                    raise ValueError(
+                        f"The evaluation model {self.simulator_model.name} does not support multimodal inputs, please use one of the following evaluation models: {', '.join([cls.__name__ for cls in MULTIMODAL_SUPPORTED_MODELS])}"
+                    )
+
         self.simulation_cost = 0 if self.using_native_model else None
 
         async def simulate_conversations(
deepeval/simulator/template.py CHANGED

@@ -7,6 +7,13 @@ from deepeval.test_case import Turn
 
 
 class ConversationSimulatorTemplate:
+    multimodal_rules = """
+    --- MULTIMODAL INPUT RULES ---
+    - Treat image content as factual evidence.
+    - Only reference visual details that are explicitly and clearly visible.
+    - Do not infer or guess objects, text, or details not visibly present.
+    - If an image is unclear or ambiguous, mark uncertainty explicitly.
+    """
 
     @staticmethod
     def simulate_first_user_turn(
@@ -23,6 +30,8 @@ class ConversationSimulatorTemplate:
         3. Avoid providing excessive details upfront; the goal is to initiate the conversation and build rapport, not to solve it in the first message.
         4. The message should be concise, ideally no more than 1-3 sentences.
 
+        {ConversationSimulatorTemplate.multimodal_rules}
+
         IMPORTANT: The output must be formatted as a JSON object with a single key `simulated_input`, where the value is the generated opening message in {language}.
 
         Example Language: english
@@ -61,6 +70,8 @@ class ConversationSimulatorTemplate:
         3. Keep the tone consistent with the previous user inputs.
         4. The generated user input should be concise, ideally no more than 1-2 sentences.
 
+        {ConversationSimulatorTemplate.multimodal_rules}
+
         IMPORTANT: The output must be formatted as a JSON object with a single key `simulated_input`,
         where the value is the generated user input in {language}.
 
@@ -101,6 +112,8 @@ class ConversationSimulatorTemplate:
         2. If the expected outcome has been met, mark the conversation as complete.
         3. If not, mark it as incomplete and briefly describe what remains to be done.
 
+        {ConversationSimulatorTemplate.multimodal_rules}
+
         IMPORTANT: The output must be formatted as a JSON object with two keys:
         `is_complete` (a boolean) and `reason` (a string).
 
deepeval/test_case/api.py CHANGED

@@ -12,7 +12,6 @@ from deepeval.test_case import (
     ConversationalTestCase,
     Turn,
 )
-from deepeval.test_case.llm_test_case import _MLLM_IMAGE_REGISTRY
 from deepeval.constants import PYTEST_RUN_TEST_NAME
 
 
@@ -33,7 +32,6 @@ def create_api_test_case(
     trace: Optional[TraceApi] = None,
     index: Optional[int] = None,
 ) -> Union[LLMApiTestCase, ConversationalApiTestCase]:
-    from deepeval.utils import convert_to_multi_modal_array
 
     if isinstance(test_case, ConversationalTestCase):
         order = (
@@ -61,8 +59,10 @@ def create_api_test_case(
             context=test_case.context,
             tags=test_case.tags,
             comments=test_case.comments,
+            imagesMapping=test_case._get_images_mapping(),
             additionalMetadata=test_case.additional_metadata,
         )
+
        api_test_case.turns = [
             create_api_turn(
                 turn=turn,
@@ -86,48 +86,27 @@ def create_api_test_case(
         name = os.getenv(PYTEST_RUN_TEST_NAME, f"test_case_{order}")
         metrics_data = []
 
- [… 22 removed lines (old 89-110) whose content was not preserved in the original diff view …]
-    elif isinstance(test_case, LLMTestCase) and test_case.multimodal:
-        api_test_case = LLMApiTestCase(
-            name=name,
-            input=test_case.input,
-            actualOutput=test_case.actual_output,
-            expectedOutput=test_case.expected_output,
-            retrievalContext=test_case.retrieval_context,
-            context=test_case.context,
-            imagesMapping=_MLLM_IMAGE_REGISTRY,
-            toolsCalled=test_case.tools_called,
-            expectedTools=test_case.expected_tools,
-            tokenCost=test_case.token_cost,
-            completionTime=test_case.completion_time,
-            success=success,
-            metricsData=metrics_data,
-            runDuration=None,
-            evaluationCost=None,
-            order=order,
-            additionalMetadata=test_case.additional_metadata,
-            comments=test_case.comments,
-        )
+        api_test_case = LLMApiTestCase(
+            name=name,
+            input=test_case.input,
+            actualOutput=test_case.actual_output,
+            expectedOutput=test_case.expected_output,
+            retrievalContext=test_case.retrieval_context,
+            context=test_case.context,
+            imagesMapping=test_case._get_images_mapping(),
+            toolsCalled=test_case.tools_called,
+            expectedTools=test_case.expected_tools,
+            tokenCost=test_case.token_cost,
+            completionTime=test_case.completion_time,
+            success=success,
+            metricsData=metrics_data,
+            runDuration=None,
+            evaluationCost=None,
+            order=order,
+            additionalMetadata=test_case.additional_metadata,
+            comments=test_case.comments,
+            tags=test_case.tags,
+            trace=trace,
+        )
         # llm_test_case_lookup_map[instance_id] = api_test_case
         return api_test_case
deepeval/test_case/arena_test_case.py CHANGED

@@ -1,7 +1,7 @@
 from typing import List, Dict, Optional, Union
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from pydantic import BaseModel
-
+import re
 from deepeval.test_case import (
     LLMTestCase,
 )
@@ -19,6 +19,7 @@ class Contestant(BaseModel):
 @dataclass
 class ArenaTestCase:
     contestants: List[Contestant]
+    multimodal: bool = field(default=False)
 
     def __post_init__(self):
         contestant_names = [contestant.name for contestant in self.contestants]
@@ -38,6 +39,10 @@ class ArenaTestCase:
                 "All contestants must have the same 'expected_output'."
             )
 
+        for contestant in self.contestants:
+            if contestant.test_case.multimodal:
+                self.multimodal = True
+
 
 class Arena:
     test_cases: List[ArenaTestCase]
deepeval/test_case/conversational_test_case.py CHANGED

@@ -1,3 +1,4 @@
+import re
 from pydantic import (
     BaseModel,
     Field,
@@ -17,6 +18,7 @@ from deepeval.test_case.mcp import (
     MCPToolCall,
     validate_mcp_servers,
 )
+from deepeval.test_case.llm_test_case import _MLLM_IMAGE_REGISTRY
 
 
 class TurnParams(Enum):
@@ -170,12 +172,28 @@ class ConversationalTestCase(BaseModel):
             return self
 
         pattern = r"\[DEEPEVAL:IMAGE:(.*?)\]"
-        self.…
- [… 5 more removed lines whose content was not preserved in the original diff view …]
+        if self.scenario:
+            if re.search(pattern, self.scenario) is not None:
+                self.multimodal = True
+                return self
+        if self.expected_outcome:
+            if re.search(pattern, self.expected_outcome) is not None:
+                self.multimodal = True
+                return self
+        if self.user_description:
+            if re.search(pattern, self.user_description) is not None:
+                self.multimodal = True
+                return self
+        if self.turns:
+            for turn in self.turns:
+                if re.search(pattern, turn.content) is not None:
+                    self.multimodal = True
+                    return self
+                if turn.retrieval_context is not None:
+                    self.multimodal = any(
+                        re.search(pattern, context) is not None
+                        for context in turn.retrieval_context
+                    )
 
         return self
 
@@ -215,3 +233,34 @@ class ConversationalTestCase(BaseModel):
         data["turns"] = copied_turns
 
         return data
+
+    def _get_images_mapping(self) -> Dict[str, MLLMImage]:
+        pattern = r"\[DEEPEVAL:IMAGE:(.*?)\]"
+        image_ids = set()
+
+        def extract_ids_from_string(s: Optional[str]) -> None:
+            """Helper to extract image IDs from a string."""
+            if s is not None and isinstance(s, str):
+                matches = re.findall(pattern, s)
+                image_ids.update(matches)
+
+        def extract_ids_from_list(lst: Optional[List[str]]) -> None:
+            """Helper to extract image IDs from a list of strings."""
+            if lst is not None:
+                for item in lst:
+                    extract_ids_from_string(item)
+
+        extract_ids_from_string(self.scenario)
+        extract_ids_from_string(self.expected_outcome)
+        extract_ids_from_list(self.context)
+        extract_ids_from_string(self.user_description)
+        for turn in self.turns:
+            extract_ids_from_string(turn.content)
+            extract_ids_from_list(turn.retrieval_context)
+
+        images_mapping = {}
+        for img_id in image_ids:
+            if img_id in _MLLM_IMAGE_REGISTRY:
+                images_mapping[img_id] = _MLLM_IMAGE_REGISTRY[img_id]
+
+        return images_mapping if len(images_mapping) > 0 else None
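Note: per the validator above, a conversational test case now flips its own `multimodal` flag whenever a scenario, expected outcome, user description, or turn content embeds a `[DEEPEVAL:IMAGE:<id>]` marker. A hedged sketch of that behavior (the placeholder id below is a literal example; in practice deepeval generates these ids when an MLLMImage registers itself):

from deepeval.test_case import ConversationalTestCase, Turn

tc = ConversationalTestCase(
    scenario="User sends a photo of a receipt [DEEPEVAL:IMAGE:1a2b3c]",
    turns=[
        Turn(role="user", content="What is the total on this receipt?"),
        Turn(role="assistant", content="The total is $42.10."),
    ],
)
# The model_validator above spots the placeholder in the scenario:
assert tc.multimodal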
deepeval/test_case/llm_test_case.py CHANGED

@@ -60,19 +60,34 @@ class MLLMImage:
         if self.local:
             path = self.process_url(self.url)
             self.filename = os.path.basename(path)
-            self.mimeType = (
- […]
-            )
- […]
- […]
-            self.…
+            self.mimeType = mimetypes.guess_type(path)[0] or "image/jpeg"
+
+            if not os.path.exists(path):
+                raise FileNotFoundError(f"Image file not found: {path}")
+
+            self._load_base64(path)
         else:
+            if not self.url.startswith(("http://", "https://")):
+                raise ValueError(
+                    f"Invalid remote URL format: {self.url}. URL must start with http:// or https://"
+                )
             self.filename = None
             self.mimeType = None
             self.dataBase64 = None
 
         _MLLM_IMAGE_REGISTRY[self._id] = self
 
+    def _load_base64(self, path: str):
+        with open(path, "rb") as f:
+            raw = f.read()
+        self.dataBase64 = base64.b64encode(raw).decode("ascii")
+
+    def ensure_images_loaded(self):
+        if self.local and self.dataBase64 is None:
+            path = self.process_url(self.url)
+            self._load_base64(path)
+        return self
+
     def _placeholder(self) -> str:
         return f"[DEEPEVAL:IMAGE:{self._id}]"
 
@@ -376,6 +391,16 @@ class LLMTestCase(BaseModel):
             if isinstance(self.input, str)
             else self.multimodal
         )
+        if self.retrieval_context is not None:
+            auto_detect = auto_detect or any(
+                re.search(pattern, context) is not None
+                for context in self.retrieval_context
+            )
+        if self.context is not None:
+            auto_detect = auto_detect or any(
+                re.search(pattern, context) is not None
+                for context in self.context
+            )
 
         self.multimodal = auto_detect
         return self
@@ -486,3 +511,32 @@ class LLMTestCase(BaseModel):
         )
 
         return data
+
+    def _get_images_mapping(self) -> Dict[str, MLLMImage]:
+        pattern = r"\[DEEPEVAL:IMAGE:(.*?)\]"
+        image_ids = set()
+
+        def extract_ids_from_string(s: Optional[str]) -> None:
+            """Helper to extract image IDs from a string."""
+            if s is not None and isinstance(s, str):
+                matches = re.findall(pattern, s)
+                image_ids.update(matches)
+
+        def extract_ids_from_list(lst: Optional[List[str]]) -> None:
+            """Helper to extract image IDs from a list of strings."""
+            if lst is not None:
+                for item in lst:
+                    extract_ids_from_string(item)
+
+        extract_ids_from_string(self.input)
+        extract_ids_from_string(self.actual_output)
+        extract_ids_from_string(self.expected_output)
+        extract_ids_from_list(self.context)
+        extract_ids_from_list(self.retrieval_context)
+
+        images_mapping = {}
+        for img_id in image_ids:
+            if img_id in _MLLM_IMAGE_REGISTRY:
+                images_mapping[img_id] = _MLLM_IMAGE_REGISTRY[img_id]
+
+        return images_mapping if len(images_mapping) > 0 else None
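Note: on the single-turn side, the same registry underpins `_get_images_mapping`: every MLLMImage registers itself under an id, test case text carries `[DEEPEVAL:IMAGE:<id>]` placeholders, and only ids actually referenced by the test case end up in the images mapping. A hedged sketch using the internal `_placeholder()` helper shown above (and assuming an https URL is treated as non-local, as the constructor logic implies):

from deepeval.test_case import LLMTestCase, MLLMImage

img = MLLMImage(url="https://example.com/chart.png")

# Embedding the image's placeholder in the input marks the test case
# multimodal and lets _get_images_mapping resolve the id back to the image.
tc = LLMTestCase(
    input=f"What trend does this chart show? {img._placeholder()}",
    actual_output="Revenue grows roughly linearly through Q4.",
)
assert tc.multimodal
mapping = tc._get_images_mapping()  # {img._id: img}, resolved via the registry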
deepeval/test_run/api.py CHANGED

@@ -126,6 +126,9 @@ class ConversationalApiTestCase(BaseModel):
     additional_metadata: Optional[Dict] = Field(
         None, alias="additionalMetadata"
     )
+    images_mapping: Optional[Dict[str, MLLMImage]] = Field(
+        None, alias="imagesMapping"
+    )
     tags: Optional[List[str]] = Field(None)
 
     def update_metric_data(self, metrics_data: MetricData):