deepeval 3.7.3__py3-none-any.whl → 3.7.5__py3-none-any.whl
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
- deepeval/_version.py +1 -1
- deepeval/cli/test.py +1 -1
- deepeval/config/settings.py +102 -13
- deepeval/dataset/golden.py +54 -2
- deepeval/evaluate/configs.py +1 -1
- deepeval/evaluate/evaluate.py +16 -8
- deepeval/evaluate/execute.py +74 -27
- deepeval/evaluate/utils.py +26 -22
- deepeval/integrations/pydantic_ai/agent.py +19 -2
- deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
- deepeval/metrics/__init__.py +14 -12
- deepeval/metrics/answer_relevancy/answer_relevancy.py +74 -29
- deepeval/metrics/answer_relevancy/template.py +188 -92
- deepeval/metrics/argument_correctness/template.py +2 -2
- deepeval/metrics/base_metric.py +2 -5
- deepeval/metrics/bias/template.py +3 -3
- deepeval/metrics/contextual_precision/contextual_precision.py +53 -15
- deepeval/metrics/contextual_precision/template.py +115 -66
- deepeval/metrics/contextual_recall/contextual_recall.py +50 -13
- deepeval/metrics/contextual_recall/template.py +106 -55
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +47 -15
- deepeval/metrics/contextual_relevancy/template.py +87 -58
- deepeval/metrics/conversation_completeness/template.py +2 -2
- deepeval/metrics/conversational_dag/templates.py +4 -4
- deepeval/metrics/conversational_g_eval/template.py +4 -3
- deepeval/metrics/dag/templates.py +5 -5
- deepeval/metrics/faithfulness/faithfulness.py +70 -27
- deepeval/metrics/faithfulness/schema.py +1 -1
- deepeval/metrics/faithfulness/template.py +200 -115
- deepeval/metrics/g_eval/utils.py +2 -2
- deepeval/metrics/hallucination/template.py +4 -4
- deepeval/metrics/indicator.py +4 -4
- deepeval/metrics/misuse/template.py +2 -2
- deepeval/metrics/multimodal_metrics/__init__.py +0 -18
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +24 -17
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +26 -21
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +24 -17
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +24 -17
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +19 -19
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +63 -78
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +20 -20
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +71 -50
- deepeval/metrics/non_advice/template.py +2 -2
- deepeval/metrics/pii_leakage/template.py +2 -2
- deepeval/metrics/prompt_alignment/template.py +4 -4
- deepeval/metrics/ragas.py +3 -3
- deepeval/metrics/role_violation/template.py +2 -2
- deepeval/metrics/step_efficiency/step_efficiency.py +1 -1
- deepeval/metrics/tool_correctness/tool_correctness.py +2 -2
- deepeval/metrics/toxicity/template.py +4 -4
- deepeval/metrics/turn_contextual_precision/schema.py +21 -0
- deepeval/metrics/turn_contextual_precision/template.py +187 -0
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +550 -0
- deepeval/metrics/turn_contextual_recall/schema.py +21 -0
- deepeval/metrics/turn_contextual_recall/template.py +178 -0
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +520 -0
- deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
- deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +535 -0
- deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
- deepeval/metrics/turn_faithfulness/template.py +218 -0
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +596 -0
- deepeval/metrics/turn_relevancy/template.py +2 -2
- deepeval/metrics/utils.py +39 -58
- deepeval/models/__init__.py +0 -12
- deepeval/models/base_model.py +16 -38
- deepeval/models/embedding_models/__init__.py +7 -0
- deepeval/models/embedding_models/azure_embedding_model.py +69 -32
- deepeval/models/embedding_models/local_embedding_model.py +39 -22
- deepeval/models/embedding_models/ollama_embedding_model.py +42 -18
- deepeval/models/embedding_models/openai_embedding_model.py +50 -15
- deepeval/models/llms/amazon_bedrock_model.py +1 -2
- deepeval/models/llms/anthropic_model.py +53 -20
- deepeval/models/llms/azure_model.py +140 -43
- deepeval/models/llms/deepseek_model.py +38 -23
- deepeval/models/llms/gemini_model.py +222 -103
- deepeval/models/llms/grok_model.py +39 -27
- deepeval/models/llms/kimi_model.py +39 -23
- deepeval/models/llms/litellm_model.py +103 -45
- deepeval/models/llms/local_model.py +35 -22
- deepeval/models/llms/ollama_model.py +129 -17
- deepeval/models/llms/openai_model.py +151 -50
- deepeval/models/llms/portkey_model.py +149 -0
- deepeval/models/llms/utils.py +5 -3
- deepeval/models/retry_policy.py +17 -14
- deepeval/models/utils.py +94 -4
- deepeval/optimizer/__init__.py +5 -0
- deepeval/optimizer/algorithms/__init__.py +6 -0
- deepeval/optimizer/algorithms/base.py +29 -0
- deepeval/optimizer/algorithms/configs.py +18 -0
- deepeval/optimizer/algorithms/copro/__init__.py +5 -0
- deepeval/optimizer/algorithms/copro/copro.py +836 -0
- deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
- deepeval/optimizer/algorithms/gepa/gepa.py +737 -0
- deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
- deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
- deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
- deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
- deepeval/optimizer/algorithms/simba/__init__.py +5 -0
- deepeval/optimizer/algorithms/simba/simba.py +999 -0
- deepeval/optimizer/algorithms/simba/types.py +15 -0
- deepeval/optimizer/configs.py +31 -0
- deepeval/optimizer/policies.py +227 -0
- deepeval/optimizer/prompt_optimizer.py +263 -0
- deepeval/optimizer/rewriter/__init__.py +5 -0
- deepeval/optimizer/rewriter/rewriter.py +124 -0
- deepeval/optimizer/rewriter/utils.py +214 -0
- deepeval/optimizer/scorer/__init__.py +5 -0
- deepeval/optimizer/scorer/base.py +86 -0
- deepeval/optimizer/scorer/scorer.py +316 -0
- deepeval/optimizer/scorer/utils.py +30 -0
- deepeval/optimizer/types.py +148 -0
- deepeval/optimizer/utils.py +480 -0
- deepeval/prompt/prompt.py +7 -6
- deepeval/test_case/__init__.py +1 -3
- deepeval/test_case/api.py +12 -10
- deepeval/test_case/conversational_test_case.py +19 -1
- deepeval/test_case/llm_test_case.py +152 -1
- deepeval/test_case/utils.py +4 -8
- deepeval/test_run/api.py +15 -14
- deepeval/test_run/cache.py +2 -0
- deepeval/test_run/test_run.py +9 -4
- deepeval/tracing/patchers.py +9 -4
- deepeval/tracing/tracing.py +2 -2
- deepeval/utils.py +89 -0
- {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/METADATA +1 -4
- {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/RECORD +134 -118
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
- deepeval/models/mlllms/__init__.py +0 -4
- deepeval/models/mlllms/azure_model.py +0 -334
- deepeval/models/mlllms/gemini_model.py +0 -284
- deepeval/models/mlllms/ollama_model.py +0 -144
- deepeval/models/mlllms/openai_model.py +0 -258
- deepeval/test_case/mllm_test_case.py +0 -170
- /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
- {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/WHEEL +0 -0
- {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/entry_points.txt +0 -0
deepeval/_version.py
CHANGED
@@ -1 +1 @@
-__version__: str = "3.7.3"
+__version__: str = "3.7.5"
deepeval/cli/test.py
CHANGED
@@ -160,7 +160,7 @@ def run(
         pytest_args.extend(["--identifier", identifier])
 
     # Add the deepeval plugin file to pytest arguments
-    pytest_args.extend(["-p", "
+    pytest_args.extend(["-p", "deepeval"])
     # Append the extra arguments collected by allow_extra_args=True
     # Pytest will raise its own error if the arguments are invalid (error:
     if ctx.args:
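The fix above registers the plugin with pytest by the importable package name ("deepeval"); the old argument is truncated in this view and left as-is. As a hedged sketch, the equivalent programmatic invocation would look like this, with an illustrative test file path that is not part of the diff:

import pytest

# pytest's -p flag loads a plugin by importable module name before collection,
# so registering "deepeval" works from any working directory.
exit_code = pytest.main(["-p", "deepeval", "tests/test_llm_app.py"])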
deepeval/config/settings.py
CHANGED
@@ -49,6 +49,8 @@ _DEPRECATED_TO_OVERRIDE = {
     "DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS": "DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS_OVERRIDE",
     "DEEPEVAL_TASK_GATHER_BUFFER_SECONDS": "DEEPEVAL_TASK_GATHER_BUFFER_SECONDS_OVERRIDE",
 }
+# Track which secrets we've warned about when loading from the legacy keyfile
+_LEGACY_KEYFILE_SECRET_WARNED: set[str] = set()
 
 
 def _find_legacy_enum(env_key: str):
@@ -88,6 +90,82 @@ def _is_secret_key(settings: "Settings", env_key: str) -> bool:
     return False
 
 
+def _merge_legacy_keyfile_into_env() -> None:
+    """
+    Backwards compatibility: merge values from the legacy .deepeval/.deepeval
+    JSON keystore into os.environ for known Settings fields, without
+    overwriting existing process env vars.
+
+    This runs before we compute the Settings env fingerprint so that Pydantic
+    can see these values on first construction.
+
+    Precedence: process env -> dotenv -> legacy json
+    """
+    # if somebody really wants to skip this behavior
+    if parse_bool(os.getenv("DEEPEVAL_DISABLE_LEGACY_KEYFILE"), default=False):
+        return
+
+    from deepeval.constants import HIDDEN_DIR, KEY_FILE
+    from deepeval.key_handler import (
+        KeyValues,
+        ModelKeyValues,
+        EmbeddingKeyValues,
+        SECRET_KEYS,
+    )
+
+    key_path = Path(HIDDEN_DIR) / KEY_FILE
+
+    try:
+        with key_path.open("r", encoding="utf-8") as f:
+            try:
+                data = json.load(f)
+            except json.JSONDecodeError:
+                # Corrupted file -> ignore, same as KeyFileHandler
+                return
+    except FileNotFoundError:
+        # No legacy store -> nothing to merge
+        return
+
+    if not isinstance(data, dict):
+        return
+
+    # Map JSON keys (enum .value) -> env keys (enum .name)
+    mapping: Dict[str, str] = {}
+    for enum in (KeyValues, ModelKeyValues, EmbeddingKeyValues):
+        for member in enum:
+            mapping[member.value] = member.name
+
+    for json_key, raw in data.items():
+        env_key = mapping.get(json_key)
+        if not env_key:
+            continue
+
+        # Process env always wins
+        if env_key in os.environ:
+            continue
+        if raw is None:
+            continue
+
+        # Mirror the legacy warning semantics for secrets, but only once per key
+        if (
+            json_key in SECRET_KEYS
+            and json_key not in _LEGACY_KEYFILE_SECRET_WARNED
+        ):
+            logger.warning(
+                "Reading secret '%s' from legacy %s/%s. "
+                "Persisting API keys in plaintext is deprecated. "
+                "Move this to your environment (.env / .env.local). "
+                "This fallback will be removed in a future release.",
+                json_key,
+                HIDDEN_DIR,
+                KEY_FILE,
+            )
+            _LEGACY_KEYFILE_SECRET_WARNED.add(json_key)
+
+        # Let Settings validators coerce types; we just inject the raw string
+        os.environ[env_key] = str(raw)
+
+
 def _read_env_file(path: Path) -> Dict[str, str]:
     if not path.exists():
         return {}
@@ -258,6 +336,7 @@ class Settings(BaseSettings):
     GOOGLE_GENAI_USE_VERTEXAI: Optional[bool] = None
     GOOGLE_CLOUD_PROJECT: Optional[str] = None
     GOOGLE_CLOUD_LOCATION: Optional[str] = None
+    GOOGLE_SERVICE_ACCOUNT_KEY: Optional[str] = None
     # Grok
     USE_GROK_MODEL: Optional[bool] = None
     GROK_API_KEY: Optional[SecretStr] = None
@@ -291,6 +370,12 @@ class Settings(BaseSettings):
     OPENAI_MODEL_NAME: Optional[str] = None
     OPENAI_COST_PER_INPUT_TOKEN: Optional[float] = None
     OPENAI_COST_PER_OUTPUT_TOKEN: Optional[float] = None
+    # PortKey
+    USE_PORTKEY_MODEL: Optional[bool] = None
+    PORTKEY_API_KEY: Optional[SecretStr] = None
+    PORTKEY_MODEL_NAME: Optional[str] = None
+    PORTKEY_BASE_URL: Optional[AnyUrl] = None
+    PORTKEY_PROVIDER_NAME: Optional[str] = None
     # Vertex AI
     VERTEX_AI_MODEL_NAME: Optional[str] = None
     # VLLM
@@ -516,29 +601,30 @@ class Settings(BaseSettings):
         "CONFIDENT_OPEN_BROWSER",
         "CONFIDENT_TRACE_FLUSH",
         "CONFIDENT_TRACE_VERBOSE",
+        "CUDA_LAUNCH_BLOCKING",
+        "DEEPEVAL_VERBOSE_MODE",
+        "DEEPEVAL_GRPC_LOGGING",
+        "DEEPEVAL_DISABLE_DOTENV",
+        "DEEPEVAL_TELEMETRY_OPT_OUT",
+        "DEEPEVAL_UPDATE_WARNING_OPT_IN",
+        "ENABLE_DEEPEVAL_CACHE",
+        "ERROR_REPORTING",
+        "GOOGLE_GENAI_USE_VERTEXAI",
+        "IGNORE_DEEPEVAL_ERRORS",
+        "SKIP_DEEPEVAL_MISSING_PARAMS",
+        "TOKENIZERS_PARALLELISM",
+        "TRANSFORMERS_NO_ADVISORY_WARNINGS",
         "USE_OPENAI_MODEL",
         "USE_AZURE_OPENAI",
         "USE_LOCAL_MODEL",
         "USE_GEMINI_MODEL",
-        "GOOGLE_GENAI_USE_VERTEXAI",
         "USE_MOONSHOT_MODEL",
         "USE_GROK_MODEL",
        "USE_DEEPSEEK_MODEL",
         "USE_LITELLM",
         "USE_AZURE_OPENAI_EMBEDDING",
         "USE_LOCAL_EMBEDDINGS",
-        "DEEPEVAL_GRPC_LOGGING",
-        "DEEPEVAL_DISABLE_DOTENV",
-        "DEEPEVAL_TELEMETRY_OPT_OUT",
-        "DEEPEVAL_UPDATE_WARNING_OPT_IN",
-        "TOKENIZERS_PARALLELISM",
-        "TRANSFORMERS_NO_ADVISORY_WARNINGS",
-        "CUDA_LAUNCH_BLOCKING",
-        "ERROR_REPORTING",
-        "IGNORE_DEEPEVAL_ERRORS",
-        "SKIP_DEEPEVAL_MISSING_PARAMS",
-        "DEEPEVAL_VERBOSE_MODE",
-        "ENABLE_DEEPEVAL_CACHE",
+        "USE_PORTKEY_MODEL",
         mode="before",
     )
     @classmethod
@@ -1008,6 +1094,9 @@ _settings_lock = threading.RLock()
 
 
 def _calc_env_fingerprint() -> str:
+    # Pull legacy .deepeval JSON-based settings into the process env before hashing
+    _merge_legacy_keyfile_into_env()
+
     env = os.environ.copy()
     # must hash in a stable order.
     keys = sorted(
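The new _merge_legacy_keyfile_into_env helper gives the process environment strict precedence over the legacy .deepeval/.deepeval JSON keystore, and the whole fallback can be switched off with DEEPEVAL_DISABLE_LEGACY_KEYFILE. A minimal sketch of just that precedence rule, with illustrative key names standing in for the real enum-driven mapping:

import os

legacy_store = {"api_key": "key-from-json"}  # stands in for the parsed legacy JSON
mapping = {"api_key": "API_KEY"}             # stands in for enum .value -> .name

os.environ["API_KEY"] = "key-from-env"       # pre-existing process env var

for json_key, raw in legacy_store.items():
    env_key = mapping.get(json_key)
    # same guards as in the diff: process env always wins, None values are skipped
    if env_key and env_key not in os.environ and raw is not None:
        os.environ[env_key] = str(raw)

assert os.environ["API_KEY"] == "key-from-env"  # the keystore value never lands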
deepeval/dataset/golden.py
CHANGED
@@ -1,6 +1,6 @@
-from pydantic import BaseModel, Field, PrivateAttr
+from pydantic import BaseModel, Field, PrivateAttr, model_validator
 from typing import Optional, Dict, List
-from deepeval.test_case import ToolCall, Turn
+from deepeval.test_case import ToolCall, Turn, MLLMImage
 
 
 class Golden(BaseModel):
@@ -32,10 +32,40 @@ class Golden(BaseModel):
     custom_column_key_values: Optional[Dict[str, str]] = Field(
         default=None, serialization_alias="customColumnKeyValues"
     )
+    multimodal: bool = Field(False, exclude=True)
     _dataset_rank: Optional[int] = PrivateAttr(default=None)
     _dataset_alias: Optional[str] = PrivateAttr(default=None)
     _dataset_id: Optional[str] = PrivateAttr(default=None)
 
+    @model_validator(mode="after")
+    def set_is_multimodal(self):
+        import re
+
+        if self.multimodal is True:
+            return self
+
+        pattern = r"\[DEEPEVAL:IMAGE:(.*?)\]"
+        self.multimodal = (
+            any(
+                [
+                    (
+                        re.search(pattern, self.input) is not None
+                        if self.input
+                        else False
+                    ),
+                    (
+                        re.search(pattern, self.actual_output) is not None
+                        if self.actual_output
+                        else False
+                    ),
+                ]
+            )
+            if isinstance(self.input, str)
+            else self.multimodal
+        )
+
+        return self
+
 
 class ConversationalGolden(BaseModel):
     scenario: str
@@ -55,6 +85,28 @@ class ConversationalGolden(BaseModel):
         default=None, serialization_alias="customColumnKeyValues"
     )
     turns: Optional[List[Turn]] = Field(default=None)
+    multimodal: bool = Field(False, exclude=True)
     _dataset_rank: Optional[int] = PrivateAttr(default=None)
     _dataset_alias: Optional[str] = PrivateAttr(default=None)
     _dataset_id: Optional[str] = PrivateAttr(default=None)
+
+    @model_validator(mode="after")
+    def set_is_multimodal(self):
+        import re
+
+        if self.multimodal is True:
+            return self
+
+        pattern = r"\[DEEPEVAL:IMAGE:(.*?)\]"
+        self.multimodal = (
+            any(
+                [
+                    re.search(pattern, turn.content) is not None
+                    for turn in self.turns
+                ]
+            )
+            if self.turns
+            else self.multimodal
+        )
+
+        return self
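Both validators key off the same inline marker, so a golden becomes multimodal as soon as its input or actual_output (or, for conversational goldens, any turn's content) embeds a [DEEPEVAL:IMAGE:...] tag. A minimal sketch of the expected behavior, assuming Golden is exported from deepeval.dataset; the image URL is illustrative:

from deepeval.dataset import Golden

plain = Golden(input="What is the capital of France?")
tagged = Golden(
    input="Describe this chart: [DEEPEVAL:IMAGE:https://example.com/chart.png]"
)

# the model_validator above runs at construction time
assert plain.multimodal is False
assert tagged.multimodal is True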
deepeval/evaluate/configs.py
CHANGED
deepeval/evaluate/evaluate.py
CHANGED
@@ -54,7 +54,6 @@ from deepeval.metrics.indicator import (
 from deepeval.test_case import (
     LLMTestCase,
     ConversationalTestCase,
-    MLLMTestCase,
 )
 from deepeval.test_run import (
     global_test_run_manager,
@@ -71,9 +70,7 @@ from deepeval.evaluate.execute import (
 
 
 def assert_test(
-    test_case: Optional[
-        Union[LLMTestCase, ConversationalTestCase, MLLMTestCase]
-    ] = None,
+    test_case: Optional[Union[LLMTestCase, ConversationalTestCase]] = None,
     metrics: Optional[
         Union[
            List[BaseMetric],
@@ -175,7 +172,7 @@ def assert_test(
         try:
             if not metric_data.success:
                 failed_metrics_data.append(metric_data)
-        except:
+        except Exception:
             failed_metrics_data.append(metric_data)
 
     failed_metrics_str = ", ".join(
@@ -188,9 +185,7 @@ def assert_test(
 
 
 def evaluate(
-    test_cases: Union[
-        List[LLMTestCase], List[ConversationalTestCase], List[MLLMTestCase]
-    ],
+    test_cases: Union[List[LLMTestCase], List[ConversationalTestCase]],
     metrics: Optional[
         Union[
            List[BaseMetric],
@@ -272,6 +267,19 @@ def evaluate(
     test_run.hyperparameters = process_hyperparameters(hyperparameters)
     test_run.prompts = process_prompts(hyperparameters)
     global_test_run_manager.save_test_run(TEMP_FILE_PATH)
+
+    # In CLI mode (`deepeval test run`), the CLI owns finalization and will
+    # call `wrap_up_test_run()` once after pytest finishes. Finalizing here
+    # as well would double finalize the run and consequently result in
+    # duplicate uploads / local saves and temp file races, so only
+    # do it when we're NOT in CLI mode.
+    if get_is_running_deepeval():
+        return EvaluationResult(
+            test_results=test_results,
+            confident_link=None,
+            test_run_id=None,
+        )
+
     res = global_test_run_manager.wrap_up_test_run(
         run_duration, display_table=False
     )
deepeval/evaluate/execute.py
CHANGED
@@ -58,6 +58,13 @@ from deepeval.metrics import (
     BaseConversationalMetric,
     BaseMultimodalMetric,
     TaskCompletionMetric,
+    # RAG metrics that support both single-turn and multimodal
+    ContextualPrecisionMetric,
+    ContextualRecallMetric,
+    ContextualRelevancyMetric,
+    AnswerRelevancyMetric,
+    FaithfulnessMetric,
+    ToolCorrectnessMetric,
 )
 from deepeval.metrics.indicator import (
     measure_metrics_with_indicator,
@@ -70,7 +77,6 @@ from deepeval.models.retry_policy import (
 from deepeval.test_case import (
     LLMTestCase,
     ConversationalTestCase,
-    MLLMTestCase,
 )
 from deepeval.test_case.api import create_api_test_case
 from deepeval.test_run import (
@@ -110,6 +116,15 @@ from deepeval.test_run.hyperparameters import (
 
 logger = logging.getLogger(__name__)
 
+MLLM_SUPPORTED_METRICS = [
+    ContextualPrecisionMetric,
+    ContextualRecallMetric,
+    ContextualRelevancyMetric,
+    AnswerRelevancyMetric,
+    FaithfulnessMetric,
+    ToolCorrectnessMetric,
+]
+
 
 def _skip_metrics_for_error(
     span: Optional[BaseSpan] = None,
@@ -263,9 +278,7 @@ async def _await_with_outer_deadline(obj, *args, timeout: float, **kwargs):
 
 
 def execute_test_cases(
-    test_cases: Union[
-        List[LLMTestCase], List[ConversationalTestCase], List[MLLMTestCase]
-    ],
+    test_cases: Union[List[LLMTestCase], List[ConversationalTestCase]],
     metrics: Union[
         List[BaseMetric],
         List[BaseConversationalMetric],
@@ -307,6 +320,8 @@ def execute_test_cases(
             metric.async_mode = False
         if isinstance(metric, BaseMetric):
             llm_metrics.append(metric)
+            if type(metric) in MLLM_SUPPORTED_METRICS:
+                mllm_metrics.append(metric)
         elif isinstance(metric, BaseConversationalMetric):
             conversational_metrics.append(metric)
         elif isinstance(metric, BaseMultimodalMetric):
@@ -325,12 +340,12 @@ def execute_test_cases(
         )
         for i, test_case in enumerate(test_cases):
             # skip what we know we won't run
-            if isinstance(test_case, LLMTestCase):
+            if isinstance(test_case, LLMTestCase) and not test_case.multimodal:
                 if not llm_metrics:
                     update_pbar(progress, pbar_id)
                     continue
                 per_case_total = len(llm_metrics)
-            elif isinstance(test_case, MLLMTestCase):
+            elif isinstance(test_case, LLMTestCase) and test_case.multimodal:
                 if not mllm_metrics:
                     update_pbar(progress, pbar_id)
                     continue
@@ -349,10 +364,16 @@ def execute_test_cases(
 
             metrics_for_case = (
                 llm_metrics
-                if isinstance(test_case, LLMTestCase)
+                if (
+                    isinstance(test_case, LLMTestCase)
+                    and not test_case.multimodal
+                )
                 else (
                     mllm_metrics
-                    if isinstance(test_case, MLLMTestCase)
+                    if (
+                        isinstance(test_case, LLMTestCase)
+                        and test_case.multimodal
+                    )
                     else conversational_metrics
                 )
             )
@@ -360,10 +381,16 @@ def execute_test_cases(
                 test_case=test_case,
                 index=(
                     llm_test_case_count + 1
-                    if isinstance(test_case, LLMTestCase)
+                    if (
+                        isinstance(test_case, LLMTestCase)
+                        and not test_case.multimodal
+                    )
                     else (
                         mllm_test_case_count + 1
-                        if isinstance(test_case, MLLMTestCase)
+                        if (
+                            isinstance(test_case, LLMTestCase)
+                            and test_case.multimodal
+                        )
                         else conversational_test_case_count + 1
                     )
                 ),
@@ -383,7 +410,10 @@ def execute_test_cases(
             for metric in metrics:
                 metric.error = None  # Reset metric error
 
-            if isinstance(test_case, LLMTestCase):
+            if (
+                isinstance(test_case, LLMTestCase)
+                and not test_case.multimodal
+            ):
                 llm_test_case_count += 1
                 cached_test_case = None
                 if cache_config.use_cache:
@@ -436,7 +466,10 @@ def execute_test_cases(
                     update_pbar(progress, pbar_test_case_id)
 
             # No caching and not sending test cases to Confident AI for multimodal metrics yet
-            elif isinstance(test_case, MLLMTestCase):
+            elif (
+                isinstance(test_case, LLMTestCase)
+                and test_case.multimodal
+            ):
                 mllm_test_case_count += 1
                 for metric in mllm_metrics:
                     current_index = index_of[id(metric)]
@@ -560,9 +593,7 @@ def execute_test_cases(
 
 
 async def a_execute_test_cases(
-    test_cases: Union[
-        List[LLMTestCase], List[ConversationalTestCase], List[MLLMTestCase]
-    ],
+    test_cases: Union[List[LLMTestCase], List[ConversationalTestCase]],
     metrics: Union[
         List[BaseMetric],
         List[BaseConversationalMetric],
@@ -605,6 +636,8 @@ async def a_execute_test_cases(
     for metric in metrics:
         if isinstance(metric, BaseMetric):
             llm_metrics.append(metric)
+            if type(metric) in MLLM_SUPPORTED_METRICS:
+                mllm_metrics.append(metric)
         elif isinstance(metric, BaseMultimodalMetric):
             mllm_metrics.append(metric)
         elif isinstance(metric, BaseConversationalMetric):
@@ -613,7 +646,7 @@ async def a_execute_test_cases(
     llm_test_case_counter = -1
     mllm_test_case_counter = -1
     conversational_test_case_counter = -1
-    test_results: List[Union[TestResult, MLLMTestCase]] = []
+    test_results: List[Union[TestResult, LLMTestCase]] = []
     tasks = []
 
     if display_config.show_indicator and _use_bar_indicator:
@@ -632,7 +665,10 @@ async def a_execute_test_cases(
         with progress:
             for test_case in test_cases:
                 with capture_evaluation_run("test case"):
-                    if isinstance(test_case, LLMTestCase):
+                    if (
+                        isinstance(test_case, LLMTestCase)
+                        and not test_case.multimodal
+                    ):
                         if len(llm_metrics) == 0:
                             update_pbar(progress, pbar_id)
                             continue
@@ -660,7 +696,10 @@ async def a_execute_test_cases(
                         )
                         tasks.append(asyncio.create_task(task))
 
-                    elif isinstance(test_case, MLLMTestCase):
+                    elif (
+                        isinstance(test_case, LLMTestCase)
+                        and test_case.multimodal
+                    ):
                         mllm_test_case_counter += 1
                         copied_multimodal_metrics: List[
                             BaseMultimodalMetric
@@ -718,11 +757,16 @@ async def a_execute_test_cases(
                 "Gather timed out after %.1fs. Some metrics may be marked as timed out.",
                 _gather_timeout(),
             )
+            if not error_config.ignore_errors:
+                raise
 
     else:
        for test_case in test_cases:
             with capture_evaluation_run("test case"):
-                if isinstance(test_case, LLMTestCase):
+                if (
+                    isinstance(test_case, LLMTestCase)
+                    and not test_case.multimodal
+                ):
                     if len(llm_metrics) == 0:
                         continue
                     llm_test_case_counter += 1
@@ -770,7 +814,9 @@ async def a_execute_test_cases(
                    )
                    tasks.append(asyncio.create_task((task)))
 
-                elif isinstance(test_case, MLLMTestCase):
+                elif (
+                    isinstance(test_case, LLMTestCase) and test_case.multimodal
+                ):
                     mllm_test_case_counter += 1
                     copied_multimodal_metrics: List[BaseMultimodalMetric] = (
                         copy_metrics(mllm_metrics)
@@ -803,7 +849,8 @@ async def a_execute_test_cases(
             if not t.done():
                 t.cancel()
         await asyncio.gather(*tasks, return_exceptions=True)
-
+        if not error_config.ignore_errors:
+            raise
 
     return test_results
 
@@ -812,7 +859,7 @@ async def _a_execute_llm_test_cases(
     metrics: List[BaseMetric],
     test_case: LLMTestCase,
     test_run_manager: TestRunManager,
-    test_results: List[Union[TestResult, MLLMTestCase]],
+    test_results: List[Union[TestResult, LLMTestCase]],
     count: int,
     test_run: TestRun,
     ignore_errors: bool,
@@ -931,9 +978,9 @@ async def _a_execute_llm_test_cases(
 
 
 async def _a_execute_mllm_test_cases(
     metrics: List[BaseMultimodalMetric],
-    test_case: MLLMTestCase,
+    test_case: LLMTestCase,
     test_run_manager: TestRunManager,
-    test_results: List[Union[TestResult, MLLMTestCase]],
+    test_results: List[Union[TestResult, LLMTestCase]],
     count: int,
     ignore_errors: bool,
     skip_on_missing_params: bool,
@@ -1010,7 +1057,7 @@ async def _a_execute_conversational_test_cases(
     ],
     test_case: ConversationalTestCase,
     test_run_manager: TestRunManager,
-    test_results: List[Union[TestResult, MLLMTestCase]],
+    test_results: List[Union[TestResult, LLMTestCase]],
     count: int,
     ignore_errors: bool,
     skip_on_missing_params: bool,
@@ -1773,7 +1820,7 @@ async def a_execute_agentic_test_cases(
 async def _a_execute_agentic_test_case(
     golden: Golden,
     test_run_manager: TestRunManager,
-    test_results: List[Union[TestResult, MLLMTestCase]],
+    test_results: List[Union[TestResult, LLMTestCase]],
     count: int,
     verbose_mode: Optional[bool],
     ignore_errors: bool,
@@ -3202,7 +3249,7 @@ async def _evaluate_test_case_pairs(
 
 
 def _execute_metric(
     metric: BaseMetric,
-    test_case: Union[LLMTestCase, ConversationalTestCase, MLLMTestCase],
+    test_case: Union[LLMTestCase, ConversationalTestCase],
     show_metric_indicator: bool,
     in_component: bool,
     error_config: ErrorConfig,
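Taken together, these hunks retire the separate MLLMTestCase branch: a single LLMTestCase flows through either the single-turn or the multimodal pipeline depending on its multimodal flag, and only metrics whose type appears in MLLM_SUPPORTED_METRICS are mirrored into the multimodal bucket. A minimal sketch of that routing rule, with the metric list abbreviated for illustration:

from deepeval.metrics import AnswerRelevancyMetric, FaithfulnessMetric
from deepeval.test_case import LLMTestCase

# abbreviated stand-in for the full MLLM_SUPPORTED_METRICS list in the diff
MLLM_SUPPORTED = (AnswerRelevancyMetric, FaithfulnessMetric)

def pipeline_for(test_case: LLMTestCase, metric) -> str:
    # mirrors the branching in execute_test_cases / a_execute_test_cases
    if not test_case.multimodal:
        return "single-turn"
    # multimodal cases only run metrics mirrored into the MLLM bucket
    return "multimodal" if type(metric) in MLLM_SUPPORTED else "skipped"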