deepeval 3.7.3__py3-none-any.whl → 3.7.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (156)
  1. deepeval/_version.py +1 -1
  2. deepeval/cli/test.py +1 -1
  3. deepeval/config/settings.py +102 -13
  4. deepeval/dataset/golden.py +54 -2
  5. deepeval/evaluate/configs.py +1 -1
  6. deepeval/evaluate/evaluate.py +16 -8
  7. deepeval/evaluate/execute.py +74 -27
  8. deepeval/evaluate/utils.py +26 -22
  9. deepeval/integrations/pydantic_ai/agent.py +19 -2
  10. deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
  11. deepeval/metrics/__init__.py +14 -12
  12. deepeval/metrics/answer_relevancy/answer_relevancy.py +74 -29
  13. deepeval/metrics/answer_relevancy/template.py +188 -92
  14. deepeval/metrics/argument_correctness/template.py +2 -2
  15. deepeval/metrics/base_metric.py +2 -5
  16. deepeval/metrics/bias/template.py +3 -3
  17. deepeval/metrics/contextual_precision/contextual_precision.py +53 -15
  18. deepeval/metrics/contextual_precision/template.py +115 -66
  19. deepeval/metrics/contextual_recall/contextual_recall.py +50 -13
  20. deepeval/metrics/contextual_recall/template.py +106 -55
  21. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +47 -15
  22. deepeval/metrics/contextual_relevancy/template.py +87 -58
  23. deepeval/metrics/conversation_completeness/template.py +2 -2
  24. deepeval/metrics/conversational_dag/templates.py +4 -4
  25. deepeval/metrics/conversational_g_eval/template.py +4 -3
  26. deepeval/metrics/dag/templates.py +5 -5
  27. deepeval/metrics/faithfulness/faithfulness.py +70 -27
  28. deepeval/metrics/faithfulness/schema.py +1 -1
  29. deepeval/metrics/faithfulness/template.py +200 -115
  30. deepeval/metrics/g_eval/utils.py +2 -2
  31. deepeval/metrics/hallucination/template.py +4 -4
  32. deepeval/metrics/indicator.py +4 -4
  33. deepeval/metrics/misuse/template.py +2 -2
  34. deepeval/metrics/multimodal_metrics/__init__.py +0 -18
  35. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +24 -17
  36. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +26 -21
  37. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +24 -17
  38. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +24 -17
  39. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +19 -19
  40. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +63 -78
  41. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +20 -20
  42. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +71 -50
  43. deepeval/metrics/non_advice/template.py +2 -2
  44. deepeval/metrics/pii_leakage/template.py +2 -2
  45. deepeval/metrics/prompt_alignment/template.py +4 -4
  46. deepeval/metrics/ragas.py +3 -3
  47. deepeval/metrics/role_violation/template.py +2 -2
  48. deepeval/metrics/step_efficiency/step_efficiency.py +1 -1
  49. deepeval/metrics/tool_correctness/tool_correctness.py +2 -2
  50. deepeval/metrics/toxicity/template.py +4 -4
  51. deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  52. deepeval/metrics/turn_contextual_precision/template.py +187 -0
  53. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +550 -0
  54. deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  55. deepeval/metrics/turn_contextual_recall/template.py +178 -0
  56. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +520 -0
  57. deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
  58. deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
  59. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +535 -0
  60. deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
  61. deepeval/metrics/turn_faithfulness/template.py +218 -0
  62. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +596 -0
  63. deepeval/metrics/turn_relevancy/template.py +2 -2
  64. deepeval/metrics/utils.py +39 -58
  65. deepeval/models/__init__.py +0 -12
  66. deepeval/models/base_model.py +16 -38
  67. deepeval/models/embedding_models/__init__.py +7 -0
  68. deepeval/models/embedding_models/azure_embedding_model.py +69 -32
  69. deepeval/models/embedding_models/local_embedding_model.py +39 -22
  70. deepeval/models/embedding_models/ollama_embedding_model.py +42 -18
  71. deepeval/models/embedding_models/openai_embedding_model.py +50 -15
  72. deepeval/models/llms/amazon_bedrock_model.py +1 -2
  73. deepeval/models/llms/anthropic_model.py +53 -20
  74. deepeval/models/llms/azure_model.py +140 -43
  75. deepeval/models/llms/deepseek_model.py +38 -23
  76. deepeval/models/llms/gemini_model.py +222 -103
  77. deepeval/models/llms/grok_model.py +39 -27
  78. deepeval/models/llms/kimi_model.py +39 -23
  79. deepeval/models/llms/litellm_model.py +103 -45
  80. deepeval/models/llms/local_model.py +35 -22
  81. deepeval/models/llms/ollama_model.py +129 -17
  82. deepeval/models/llms/openai_model.py +151 -50
  83. deepeval/models/llms/portkey_model.py +149 -0
  84. deepeval/models/llms/utils.py +5 -3
  85. deepeval/models/retry_policy.py +17 -14
  86. deepeval/models/utils.py +94 -4
  87. deepeval/optimizer/__init__.py +5 -0
  88. deepeval/optimizer/algorithms/__init__.py +6 -0
  89. deepeval/optimizer/algorithms/base.py +29 -0
  90. deepeval/optimizer/algorithms/configs.py +18 -0
  91. deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  92. deepeval/optimizer/algorithms/copro/copro.py +836 -0
  93. deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  94. deepeval/optimizer/algorithms/gepa/gepa.py +737 -0
  95. deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  96. deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  97. deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  98. deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  99. deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  100. deepeval/optimizer/algorithms/simba/simba.py +999 -0
  101. deepeval/optimizer/algorithms/simba/types.py +15 -0
  102. deepeval/optimizer/configs.py +31 -0
  103. deepeval/optimizer/policies.py +227 -0
  104. deepeval/optimizer/prompt_optimizer.py +263 -0
  105. deepeval/optimizer/rewriter/__init__.py +5 -0
  106. deepeval/optimizer/rewriter/rewriter.py +124 -0
  107. deepeval/optimizer/rewriter/utils.py +214 -0
  108. deepeval/optimizer/scorer/__init__.py +5 -0
  109. deepeval/optimizer/scorer/base.py +86 -0
  110. deepeval/optimizer/scorer/scorer.py +316 -0
  111. deepeval/optimizer/scorer/utils.py +30 -0
  112. deepeval/optimizer/types.py +148 -0
  113. deepeval/optimizer/utils.py +480 -0
  114. deepeval/prompt/prompt.py +7 -6
  115. deepeval/test_case/__init__.py +1 -3
  116. deepeval/test_case/api.py +12 -10
  117. deepeval/test_case/conversational_test_case.py +19 -1
  118. deepeval/test_case/llm_test_case.py +152 -1
  119. deepeval/test_case/utils.py +4 -8
  120. deepeval/test_run/api.py +15 -14
  121. deepeval/test_run/cache.py +2 -0
  122. deepeval/test_run/test_run.py +9 -4
  123. deepeval/tracing/patchers.py +9 -4
  124. deepeval/tracing/tracing.py +2 -2
  125. deepeval/utils.py +89 -0
  126. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/METADATA +1 -4
  127. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/RECORD +134 -118
  128. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
  129. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
  130. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
  131. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
  132. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
  133. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
  134. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
  135. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
  136. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
  137. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
  138. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
  139. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  140. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
  141. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
  142. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  143. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
  144. deepeval/models/mlllms/__init__.py +0 -4
  145. deepeval/models/mlllms/azure_model.py +0 -334
  146. deepeval/models/mlllms/gemini_model.py +0 -284
  147. deepeval/models/mlllms/ollama_model.py +0 -144
  148. deepeval/models/mlllms/openai_model.py +0 -258
  149. deepeval/test_case/mllm_test_case.py +0 -170
  150. /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
  151. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
  152. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
  153. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
  154. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/LICENSE.md +0 -0
  155. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/WHEEL +0 -0
  156. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/entry_points.txt +0 -0
deepeval/_version.py CHANGED
@@ -1 +1 @@
-__version__: str = "3.7.3"
+__version__: str = "3.7.5"
deepeval/cli/test.py CHANGED
@@ -160,7 +160,7 @@ def run(
         pytest_args.extend(["--identifier", identifier])
 
     # Add the deepeval plugin file to pytest arguments
-    pytest_args.extend(["-p", "plugins"])
+    pytest_args.extend(["-p", "deepeval"])
     # Append the extra arguments collected by allow_extra_args=True
     # Pytest will raise its own error if the arguments are invalid (error:
     if ctx.args:
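The corrected arguments register deepeval's pytest plugin by its actual module name. A minimal sketch of the equivalent direct invocation (the test path is illustrative):

import pytest

# "-p deepeval" asks pytest to load the plugin module named "deepeval",
# which is what the corrected pytest_args above now request.
pytest_args = ["-p", "deepeval", "tests/test_example.py"]  # illustrative path
# pytest.main(pytest_args)  # returns an exit code when run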
deepeval/config/settings.py CHANGED
@@ -49,6 +49,8 @@ _DEPRECATED_TO_OVERRIDE = {
     "DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS": "DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS_OVERRIDE",
     "DEEPEVAL_TASK_GATHER_BUFFER_SECONDS": "DEEPEVAL_TASK_GATHER_BUFFER_SECONDS_OVERRIDE",
 }
+# Track which secrets we've warned about when loading from the legacy keyfile
+_LEGACY_KEYFILE_SECRET_WARNED: set[str] = set()
 
 
 def _find_legacy_enum(env_key: str):
@@ -88,6 +90,82 @@ def _is_secret_key(settings: "Settings", env_key: str) -> bool:
     return False
 
 
+def _merge_legacy_keyfile_into_env() -> None:
+    """
+    Backwards compatibility: merge values from the legacy .deepeval/.deepeval
+    JSON keystore into os.environ for known Settings fields, without
+    overwriting existing process env vars.
+
+    This runs before we compute the Settings env fingerprint so that Pydantic
+    can see these values on first construction.
+
+    Precedence: process env -> dotenv -> legacy json
+    """
+    # if somebody really wants to skip this behavior
+    if parse_bool(os.getenv("DEEPEVAL_DISABLE_LEGACY_KEYFILE"), default=False):
+        return
+
+    from deepeval.constants import HIDDEN_DIR, KEY_FILE
+    from deepeval.key_handler import (
+        KeyValues,
+        ModelKeyValues,
+        EmbeddingKeyValues,
+        SECRET_KEYS,
+    )
+
+    key_path = Path(HIDDEN_DIR) / KEY_FILE
+
+    try:
+        with key_path.open("r", encoding="utf-8") as f:
+            try:
+                data = json.load(f)
+            except json.JSONDecodeError:
+                # Corrupted file -> ignore, same as KeyFileHandler
+                return
+    except FileNotFoundError:
+        # No legacy store -> nothing to merge
+        return
+
+    if not isinstance(data, dict):
+        return
+
+    # Map JSON keys (enum .value) -> env keys (enum .name)
+    mapping: Dict[str, str] = {}
+    for enum in (KeyValues, ModelKeyValues, EmbeddingKeyValues):
+        for member in enum:
+            mapping[member.value] = member.name
+
+    for json_key, raw in data.items():
+        env_key = mapping.get(json_key)
+        if not env_key:
+            continue
+
+        # Process env always wins
+        if env_key in os.environ:
+            continue
+        if raw is None:
+            continue
+
+        # Mirror the legacy warning semantics for secrets, but only once per key
+        if (
+            json_key in SECRET_KEYS
+            and json_key not in _LEGACY_KEYFILE_SECRET_WARNED
+        ):
+            logger.warning(
+                "Reading secret '%s' from legacy %s/%s. "
+                "Persisting API keys in plaintext is deprecated. "
+                "Move this to your environment (.env / .env.local). "
+                "This fallback will be removed in a future release.",
+                json_key,
+                HIDDEN_DIR,
+                KEY_FILE,
+            )
+            _LEGACY_KEYFILE_SECRET_WARNED.add(json_key)
+
+        # Let Settings validators coerce types; we just inject the raw string
+        os.environ[env_key] = str(raw)
+
+
 def _read_env_file(path: Path) -> Dict[str, str]:
     if not path.exists():
         return {}
@@ -258,6 +336,7 @@ class Settings(BaseSettings):
     GOOGLE_GENAI_USE_VERTEXAI: Optional[bool] = None
     GOOGLE_CLOUD_PROJECT: Optional[str] = None
     GOOGLE_CLOUD_LOCATION: Optional[str] = None
+    GOOGLE_SERVICE_ACCOUNT_KEY: Optional[str] = None
     # Grok
     USE_GROK_MODEL: Optional[bool] = None
     GROK_API_KEY: Optional[SecretStr] = None
@@ -291,6 +370,12 @@ class Settings(BaseSettings):
     OPENAI_MODEL_NAME: Optional[str] = None
     OPENAI_COST_PER_INPUT_TOKEN: Optional[float] = None
     OPENAI_COST_PER_OUTPUT_TOKEN: Optional[float] = None
+    # PortKey
+    USE_PORTKEY_MODEL: Optional[bool] = None
+    PORTKEY_API_KEY: Optional[SecretStr] = None
+    PORTKEY_MODEL_NAME: Optional[str] = None
+    PORTKEY_BASE_URL: Optional[AnyUrl] = None
+    PORTKEY_PROVIDER_NAME: Optional[str] = None
     # Vertex AI
     VERTEX_AI_MODEL_NAME: Optional[str] = None
     # VLLM
@@ -516,29 +601,30 @@ class Settings(BaseSettings):
        "CONFIDENT_OPEN_BROWSER",
        "CONFIDENT_TRACE_FLUSH",
        "CONFIDENT_TRACE_VERBOSE",
+       "CUDA_LAUNCH_BLOCKING",
+       "DEEPEVAL_VERBOSE_MODE",
+       "DEEPEVAL_GRPC_LOGGING",
+       "DEEPEVAL_DISABLE_DOTENV",
+       "DEEPEVAL_TELEMETRY_OPT_OUT",
+       "DEEPEVAL_UPDATE_WARNING_OPT_IN",
+       "ENABLE_DEEPEVAL_CACHE",
+       "ERROR_REPORTING",
+       "GOOGLE_GENAI_USE_VERTEXAI",
+       "IGNORE_DEEPEVAL_ERRORS",
+       "SKIP_DEEPEVAL_MISSING_PARAMS",
+       "TOKENIZERS_PARALLELISM",
+       "TRANSFORMERS_NO_ADVISORY_WARNINGS",
        "USE_OPENAI_MODEL",
        "USE_AZURE_OPENAI",
        "USE_LOCAL_MODEL",
        "USE_GEMINI_MODEL",
-       "GOOGLE_GENAI_USE_VERTEXAI",
        "USE_MOONSHOT_MODEL",
        "USE_GROK_MODEL",
        "USE_DEEPSEEK_MODEL",
        "USE_LITELLM",
        "USE_AZURE_OPENAI_EMBEDDING",
        "USE_LOCAL_EMBEDDINGS",
-       "DEEPEVAL_GRPC_LOGGING",
-       "DEEPEVAL_DISABLE_DOTENV",
-       "DEEPEVAL_TELEMETRY_OPT_OUT",
-       "DEEPEVAL_UPDATE_WARNING_OPT_IN",
-       "TOKENIZERS_PARALLELISM",
-       "TRANSFORMERS_NO_ADVISORY_WARNINGS",
-       "CUDA_LAUNCH_BLOCKING",
-       "ERROR_REPORTING",
-       "IGNORE_DEEPEVAL_ERRORS",
-       "SKIP_DEEPEVAL_MISSING_PARAMS",
-       "DEEPEVAL_VERBOSE_MODE",
-       "ENABLE_DEEPEVAL_CACHE",
+       "USE_PORTKEY_MODEL",
        mode="before",
     )
     @classmethod
@@ -1008,6 +1094,9 @@ _settings_lock = threading.RLock()
 
 
 def _calc_env_fingerprint() -> str:
+    # Pull legacy .deepeval JSON-based settings into the process env before hashing
+    _merge_legacy_keyfile_into_env()
+
     env = os.environ.copy()
     # must hash in a stable order.
     keys = sorted(
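The docstring above fixes the precedence order at process env -> dotenv -> legacy json. A minimal sketch of that rule, reduced to a plain key-for-key merge; it deliberately omits the enum-name mapping, the opt-out flag, and the secret warnings the real _merge_legacy_keyfile_into_env performs:

import json
import os
from pathlib import Path

def merge_legacy_keyfile(key_path: Path) -> None:
    # Inject legacy JSON keystore values without overwriting the process env.
    try:
        data = json.loads(key_path.read_text(encoding="utf-8"))
    except (FileNotFoundError, json.JSONDecodeError):
        return  # no legacy store, or a corrupted one -> nothing to merge
    if not isinstance(data, dict):
        return
    for key, raw in data.items():
        if raw is not None and key not in os.environ:  # process env always wins
            os.environ[key] = str(raw)

os.environ["OPENAI_API_KEY"] = "from-env"
merge_legacy_keyfile(Path(".deepeval/.deepeval"))
assert os.environ["OPENAI_API_KEY"] == "from-env"  # never overwritten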
deepeval/dataset/golden.py CHANGED
@@ -1,6 +1,6 @@
-from pydantic import BaseModel, Field, PrivateAttr
+from pydantic import BaseModel, Field, PrivateAttr, model_validator
 from typing import Optional, Dict, List
-from deepeval.test_case import ToolCall, Turn
+from deepeval.test_case import ToolCall, Turn, MLLMImage
 
 
 class Golden(BaseModel):
@@ -32,10 +32,40 @@ class Golden(BaseModel):
     custom_column_key_values: Optional[Dict[str, str]] = Field(
         default=None, serialization_alias="customColumnKeyValues"
     )
+    multimodal: bool = Field(False, exclude=True)
     _dataset_rank: Optional[int] = PrivateAttr(default=None)
     _dataset_alias: Optional[str] = PrivateAttr(default=None)
     _dataset_id: Optional[str] = PrivateAttr(default=None)
 
+    @model_validator(mode="after")
+    def set_is_multimodal(self):
+        import re
+
+        if self.multimodal is True:
+            return self
+
+        pattern = r"\[DEEPEVAL:IMAGE:(.*?)\]"
+        self.multimodal = (
+            any(
+                [
+                    (
+                        re.search(pattern, self.input) is not None
+                        if self.input
+                        else False
+                    ),
+                    (
+                        re.search(pattern, self.actual_output) is not None
+                        if self.actual_output
+                        else False
+                    ),
+                ]
+            )
+            if isinstance(self.input, str)
+            else self.multimodal
+        )
+
+        return self
+
 
 
 class ConversationalGolden(BaseModel):
     scenario: str
@@ -55,6 +85,28 @@ class ConversationalGolden(BaseModel):
         default=None, serialization_alias="customColumnKeyValues"
     )
     turns: Optional[List[Turn]] = Field(default=None)
+    multimodal: bool = Field(False, exclude=True)
     _dataset_rank: Optional[int] = PrivateAttr(default=None)
     _dataset_alias: Optional[str] = PrivateAttr(default=None)
     _dataset_id: Optional[str] = PrivateAttr(default=None)
+
+    @model_validator(mode="after")
+    def set_is_multimodal(self):
+        import re
+
+        if self.multimodal is True:
+            return self
+
+        pattern = r"\[DEEPEVAL:IMAGE:(.*?)\]"
+        self.multimodal = (
+            any(
+                [
+                    re.search(pattern, turn.content) is not None
+                    for turn in self.turns
+                ]
+            )
+            if self.turns
+            else self.multimodal
+        )
+
+        return self
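Both validators hinge on one tag convention. A self-contained sketch of the detection rule (the example strings are hypothetical):

import re

IMAGE_TAG = r"\[DEEPEVAL:IMAGE:(.*?)\]"  # same pattern as set_is_multimodal

def has_image_tag(text) -> bool:
    return isinstance(text, str) and re.search(IMAGE_TAG, text) is not None

assert has_image_tag("Describe [DEEPEVAL:IMAGE:https://example.com/cat.png]")
assert not has_image_tag("Plain text input")
assert not has_image_tag(None)  # missing fields never flag a golden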
deepeval/evaluate/configs.py CHANGED
@@ -7,7 +7,7 @@ from deepeval.test_run.test_run import TestRunResultDisplay
 @dataclass
 class AsyncConfig:
     run_async: bool = True
-    throttle_value: int = 0
+    throttle_value: float = 0
     max_concurrent: int = 20
 
     def __post_init__(self):
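Widening throttle_value from int to float makes sub-second throttling between async evaluations expressible. A hedged usage sketch, assuming the dataclass is importable from the path shown above:

from deepeval.evaluate.configs import AsyncConfig

# 250 ms between task submissions, at most 10 concurrent evaluations
config = AsyncConfig(run_async=True, throttle_value=0.25, max_concurrent=10)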
deepeval/evaluate/evaluate.py CHANGED
@@ -54,7 +54,6 @@ from deepeval.metrics.indicator import (
 from deepeval.test_case import (
     LLMTestCase,
     ConversationalTestCase,
-    MLLMTestCase,
 )
 from deepeval.test_run import (
     global_test_run_manager,
@@ -71,9 +70,7 @@ from deepeval.evaluate.execute import (
 
 
 def assert_test(
-    test_case: Optional[
-        Union[LLMTestCase, ConversationalTestCase, MLLMTestCase]
-    ] = None,
+    test_case: Optional[Union[LLMTestCase, ConversationalTestCase]] = None,
     metrics: Optional[
         Union[
             List[BaseMetric],
@@ -175,7 +172,7 @@ def assert_test(
         try:
             if not metric_data.success:
                 failed_metrics_data.append(metric_data)
-        except:
+        except Exception:
            failed_metrics_data.append(metric_data)
 
     failed_metrics_str = ", ".join(
@@ -188,9 +185,7 @@
 
 
 def evaluate(
-    test_cases: Union[
-        List[LLMTestCase], List[ConversationalTestCase], List[MLLMTestCase]
-    ],
+    test_cases: Union[List[LLMTestCase], List[ConversationalTestCase]],
     metrics: Optional[
         Union[
             List[BaseMetric],
@@ -272,6 +267,19 @@ def evaluate(
     test_run.hyperparameters = process_hyperparameters(hyperparameters)
     test_run.prompts = process_prompts(hyperparameters)
     global_test_run_manager.save_test_run(TEMP_FILE_PATH)
+
+    # In CLI mode (`deepeval test run`), the CLI owns finalization and will
+    # call `wrap_up_test_run()` once after pytest finishes. Finalizing here
+    # as well would double finalize the run and consequently result in
+    # duplicate uploads / local saves and temp file races, so only
+    # do it when we're NOT in CLI mode.
+    if get_is_running_deepeval():
+        return EvaluationResult(
+            test_results=test_results,
+            confident_link=None,
+            test_run_id=None,
+        )
+
     res = global_test_run_manager.wrap_up_test_run(
         run_duration, display_table=False
     )
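With this guard, code that calls evaluate() while running under `deepeval test run` still receives its test results, but confident_link and test_run_id stay None because the CLI wraps up the run exactly once after pytest finishes. An illustrative sketch (the test case and metric are arbitrary):

from deepeval import evaluate
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase

result = evaluate(
    test_cases=[LLMTestCase(input="Hi", actual_output="Hello!")],
    metrics=[AnswerRelevancyMetric()],
)
# Under the CLI these are None; wrap_up_test_run() happens once, later.
print(result.confident_link, result.test_run_id)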
deepeval/evaluate/execute.py CHANGED
@@ -58,6 +58,13 @@ from deepeval.metrics import (
     BaseConversationalMetric,
     BaseMultimodalMetric,
     TaskCompletionMetric,
+    # RAG metrics that support both single-turn and multimodal
+    ContextualPrecisionMetric,
+    ContextualRecallMetric,
+    ContextualRelevancyMetric,
+    AnswerRelevancyMetric,
+    FaithfulnessMetric,
+    ToolCorrectnessMetric,
 )
 from deepeval.metrics.indicator import (
     measure_metrics_with_indicator,
@@ -70,7 +77,6 @@ from deepeval.models.retry_policy import (
 from deepeval.test_case import (
     LLMTestCase,
     ConversationalTestCase,
-    MLLMTestCase,
 )
 from deepeval.test_case.api import create_api_test_case
 from deepeval.test_run import (
@@ -110,6 +116,15 @@ from deepeval.test_run.hyperparameters import (
 
 logger = logging.getLogger(__name__)
 
+MLLM_SUPPORTED_METRICS = [
+    ContextualPrecisionMetric,
+    ContextualRecallMetric,
+    ContextualRelevancyMetric,
+    AnswerRelevancyMetric,
+    FaithfulnessMetric,
+    ToolCorrectnessMetric,
+]
+
 
 def _skip_metrics_for_error(
     span: Optional[BaseSpan] = None,
@@ -263,9 +278,7 @@ async def _await_with_outer_deadline(obj, *args, timeout: float, **kwargs):
 
 
 def execute_test_cases(
-    test_cases: Union[
-        List[LLMTestCase], List[ConversationalTestCase], List[MLLMTestCase]
-    ],
+    test_cases: Union[List[LLMTestCase], List[ConversationalTestCase]],
     metrics: Union[
         List[BaseMetric],
         List[BaseConversationalMetric],
@@ -307,6 +320,8 @@
             metric.async_mode = False
         if isinstance(metric, BaseMetric):
             llm_metrics.append(metric)
+            if type(metric) in MLLM_SUPPORTED_METRICS:
+                mllm_metrics.append(metric)
         elif isinstance(metric, BaseConversationalMetric):
             conversational_metrics.append(metric)
         elif isinstance(metric, BaseMultimodalMetric):
@@ -325,12 +340,12 @@
         )
         for i, test_case in enumerate(test_cases):
             # skip what we know we won't run
-            if isinstance(test_case, LLMTestCase):
+            if isinstance(test_case, LLMTestCase) and not test_case.multimodal:
                 if not llm_metrics:
                     update_pbar(progress, pbar_id)
                     continue
                 per_case_total = len(llm_metrics)
-            elif isinstance(test_case, MLLMTestCase):
+            elif isinstance(test_case, LLMTestCase) and test_case.multimodal:
                 if not mllm_metrics:
                     update_pbar(progress, pbar_id)
                     continue
@@ -349,10 +364,16 @@
 
             metrics_for_case = (
                 llm_metrics
-                if isinstance(test_case, LLMTestCase)
+                if (
+                    isinstance(test_case, LLMTestCase)
+                    and not test_case.multimodal
+                )
                 else (
                     mllm_metrics
-                    if isinstance(test_case, MLLMTestCase)
+                    if (
+                        isinstance(test_case, LLMTestCase)
+                        and test_case.multimodal
+                    )
                     else conversational_metrics
                 )
             )
@@ -360,10 +381,16 @@
                 test_case=test_case,
                 index=(
                     llm_test_case_count + 1
-                    if isinstance(test_case, LLMTestCase)
+                    if (
+                        isinstance(test_case, LLMTestCase)
+                        and not test_case.multimodal
+                    )
                     else (
                         mllm_test_case_count + 1
-                        if isinstance(test_case, MLLMTestCase)
+                        if (
+                            isinstance(test_case, LLMTestCase)
+                            and test_case.multimodal
+                        )
                         else conversational_test_case_count + 1
                     )
                 ),
@@ -383,7 +410,10 @@
                 for metric in metrics:
                     metric.error = None  # Reset metric error
 
-                if isinstance(test_case, LLMTestCase):
+                if (
+                    isinstance(test_case, LLMTestCase)
+                    and not test_case.multimodal
+                ):
                     llm_test_case_count += 1
                     cached_test_case = None
                     if cache_config.use_cache:
@@ -436,7 +466,10 @@
                     update_pbar(progress, pbar_test_case_id)
 
                 # No caching and not sending test cases to Confident AI for multimodal metrics yet
-                elif isinstance(test_case, MLLMTestCase):
+                elif (
+                    isinstance(test_case, LLMTestCase)
+                    and test_case.multimodal
+                ):
                     mllm_test_case_count += 1
                     for metric in mllm_metrics:
                         current_index = index_of[id(metric)]
@@ -560,9 +593,7 @@
 
 
 async def a_execute_test_cases(
-    test_cases: Union[
-        List[LLMTestCase], List[ConversationalTestCase], List[MLLMTestCase]
-    ],
+    test_cases: Union[List[LLMTestCase], List[ConversationalTestCase]],
     metrics: Union[
         List[BaseMetric],
         List[BaseConversationalMetric],
@@ -605,6 +636,8 @@
     for metric in metrics:
         if isinstance(metric, BaseMetric):
             llm_metrics.append(metric)
+            if type(metric) in MLLM_SUPPORTED_METRICS:
+                mllm_metrics.append(metric)
         elif isinstance(metric, BaseMultimodalMetric):
             mllm_metrics.append(metric)
         elif isinstance(metric, BaseConversationalMetric):
@@ -613,7 +646,7 @@
     llm_test_case_counter = -1
     mllm_test_case_counter = -1
     conversational_test_case_counter = -1
-    test_results: List[Union[TestResult, MLLMTestCase]] = []
+    test_results: List[Union[TestResult, LLMTestCase]] = []
     tasks = []
 
     if display_config.show_indicator and _use_bar_indicator:
@@ -632,7 +665,10 @@
         with progress:
             for test_case in test_cases:
                 with capture_evaluation_run("test case"):
-                    if isinstance(test_case, LLMTestCase):
+                    if (
+                        isinstance(test_case, LLMTestCase)
+                        and not test_case.multimodal
+                    ):
                         if len(llm_metrics) == 0:
                             update_pbar(progress, pbar_id)
                             continue
@@ -660,7 +696,10 @@
                         )
                         tasks.append(asyncio.create_task(task))
 
-                    elif isinstance(test_case, MLLMTestCase):
+                    elif (
+                        isinstance(test_case, LLMTestCase)
+                        and test_case.multimodal
+                    ):
                         mllm_test_case_counter += 1
                         copied_multimodal_metrics: List[
                             BaseMultimodalMetric
@@ -718,11 +757,16 @@
                 "Gather timed out after %.1fs. Some metrics may be marked as timed out.",
                 _gather_timeout(),
             )
+            if not error_config.ignore_errors:
+                raise
 
     else:
         for test_case in test_cases:
             with capture_evaluation_run("test case"):
-                if isinstance(test_case, LLMTestCase):
+                if (
+                    isinstance(test_case, LLMTestCase)
+                    and not test_case.multimodal
+                ):
                     if len(llm_metrics) == 0:
                         continue
                     llm_test_case_counter += 1
@@ -770,7 +814,9 @@
                     )
                     tasks.append(asyncio.create_task((task)))
 
-                elif isinstance(test_case, MLLMTestCase):
+                elif (
+                    isinstance(test_case, LLMTestCase) and test_case.multimodal
+                ):
                     mllm_test_case_counter += 1
                     copied_multimodal_metrics: List[BaseMultimodalMetric] = (
                         copy_metrics(mllm_metrics)
@@ -803,7 +849,8 @@
             if not t.done():
                 t.cancel()
         await asyncio.gather(*tasks, return_exceptions=True)
-        raise
+        if not error_config.ignore_errors:
+            raise
 
     return test_results
 
@@ -812,7 +859,7 @@ async def _a_execute_llm_test_cases(
     metrics: List[BaseMetric],
     test_case: LLMTestCase,
     test_run_manager: TestRunManager,
-    test_results: List[Union[TestResult, MLLMTestCase]],
+    test_results: List[Union[TestResult, LLMTestCase]],
     count: int,
     test_run: TestRun,
     ignore_errors: bool,
@@ -931,9 +978,9 @@
 
 async def _a_execute_mllm_test_cases(
     metrics: List[BaseMultimodalMetric],
-    test_case: MLLMTestCase,
+    test_case: LLMTestCase,
     test_run_manager: TestRunManager,
-    test_results: List[Union[TestResult, MLLMTestCase]],
+    test_results: List[Union[TestResult, LLMTestCase]],
     count: int,
     ignore_errors: bool,
     skip_on_missing_params: bool,
@@ -1010,7 +1057,7 @@ async def _a_execute_conversational_test_cases(
     ],
     test_case: ConversationalTestCase,
     test_run_manager: TestRunManager,
-    test_results: List[Union[TestResult, MLLMTestCase]],
+    test_results: List[Union[TestResult, LLMTestCase]],
     count: int,
     ignore_errors: bool,
     skip_on_missing_params: bool,
@@ -1773,7 +1820,7 @@ async def a_execute_agentic_test_cases(
 async def _a_execute_agentic_test_case(
     golden: Golden,
     test_run_manager: TestRunManager,
-    test_results: List[Union[TestResult, MLLMTestCase]],
+    test_results: List[Union[TestResult, LLMTestCase]],
     count: int,
     verbose_mode: Optional[bool],
     ignore_errors: bool,
@@ -3202,7 +3249,7 @@ async def _evaluate_test_case_pairs(
 
 
 def _execute_metric(
     metric: BaseMetric,
-    test_case: Union[LLMTestCase, ConversationalTestCase, MLLMTestCase],
+    test_case: Union[LLMTestCase, ConversationalTestCase],
     show_metric_indicator: bool,
     in_component: bool,
     error_config: ErrorConfig,
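One detail worth noting in the routing above: membership is checked with type(metric) in MLLM_SUPPORTED_METRICS rather than isinstance, so a user-defined subclass of a supported metric stays in the text-only bucket. A small sketch of that consequence (MyFaithfulness is hypothetical, and the list is abridged):

from deepeval.metrics import FaithfulnessMetric

MLLM_SUPPORTED_METRICS = [FaithfulnessMetric]  # abridged from execute.py

class MyFaithfulness(FaithfulnessMetric):  # hypothetical user subclass
    pass

assert FaithfulnessMetric in MLLM_SUPPORTED_METRICS
# type(metric) of a MyFaithfulness instance is MyFaithfulness, not the parent:
assert MyFaithfulness not in MLLM_SUPPORTED_METRICS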