deepeval 3.7.4__py3-none-any.whl → 3.7.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (224)
  1. deepeval/_version.py +1 -1
  2. deepeval/config/settings.py +35 -1
  3. deepeval/dataset/api.py +23 -1
  4. deepeval/dataset/golden.py +139 -2
  5. deepeval/evaluate/evaluate.py +16 -11
  6. deepeval/evaluate/execute.py +13 -181
  7. deepeval/evaluate/utils.py +6 -26
  8. deepeval/integrations/pydantic_ai/agent.py +19 -2
  9. deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
  10. deepeval/key_handler.py +3 -0
  11. deepeval/metrics/__init__.py +14 -16
  12. deepeval/metrics/answer_relevancy/answer_relevancy.py +118 -116
  13. deepeval/metrics/answer_relevancy/template.py +22 -3
  14. deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
  15. deepeval/metrics/arena_g_eval/template.py +17 -1
  16. deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
  17. deepeval/metrics/argument_correctness/template.py +19 -2
  18. deepeval/metrics/base_metric.py +13 -44
  19. deepeval/metrics/bias/bias.py +102 -108
  20. deepeval/metrics/bias/template.py +14 -2
  21. deepeval/metrics/contextual_precision/contextual_precision.py +96 -94
  22. deepeval/metrics/contextual_precision/template.py +115 -66
  23. deepeval/metrics/contextual_recall/contextual_recall.py +94 -84
  24. deepeval/metrics/contextual_recall/template.py +106 -55
  25. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +86 -84
  26. deepeval/metrics/contextual_relevancy/template.py +87 -58
  27. deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
  28. deepeval/metrics/conversation_completeness/template.py +23 -3
  29. deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
  30. deepeval/metrics/conversational_dag/nodes.py +66 -123
  31. deepeval/metrics/conversational_dag/templates.py +16 -0
  32. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
  33. deepeval/metrics/dag/dag.py +10 -0
  34. deepeval/metrics/dag/nodes.py +63 -126
  35. deepeval/metrics/dag/templates.py +16 -2
  36. deepeval/metrics/exact_match/exact_match.py +9 -1
  37. deepeval/metrics/faithfulness/faithfulness.py +138 -149
  38. deepeval/metrics/faithfulness/schema.py +1 -1
  39. deepeval/metrics/faithfulness/template.py +200 -115
  40. deepeval/metrics/g_eval/g_eval.py +87 -78
  41. deepeval/metrics/g_eval/template.py +18 -1
  42. deepeval/metrics/g_eval/utils.py +7 -6
  43. deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
  44. deepeval/metrics/goal_accuracy/template.py +21 -3
  45. deepeval/metrics/hallucination/hallucination.py +60 -75
  46. deepeval/metrics/hallucination/template.py +13 -0
  47. deepeval/metrics/indicator.py +7 -10
  48. deepeval/metrics/json_correctness/json_correctness.py +40 -38
  49. deepeval/metrics/json_correctness/template.py +10 -0
  50. deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
  51. deepeval/metrics/knowledge_retention/schema.py +9 -3
  52. deepeval/metrics/knowledge_retention/template.py +12 -0
  53. deepeval/metrics/mcp/mcp_task_completion.py +68 -38
  54. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
  55. deepeval/metrics/mcp/template.py +52 -0
  56. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
  57. deepeval/metrics/mcp_use_metric/template.py +12 -0
  58. deepeval/metrics/misuse/misuse.py +77 -97
  59. deepeval/metrics/misuse/template.py +15 -0
  60. deepeval/metrics/multimodal_metrics/__init__.py +0 -19
  61. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +59 -53
  62. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +79 -95
  63. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +59 -53
  64. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +59 -53
  65. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +111 -109
  66. deepeval/metrics/non_advice/non_advice.py +79 -105
  67. deepeval/metrics/non_advice/template.py +12 -0
  68. deepeval/metrics/pattern_match/pattern_match.py +12 -4
  69. deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
  70. deepeval/metrics/pii_leakage/template.py +14 -0
  71. deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
  72. deepeval/metrics/plan_adherence/template.py +11 -0
  73. deepeval/metrics/plan_quality/plan_quality.py +63 -87
  74. deepeval/metrics/plan_quality/template.py +9 -0
  75. deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
  76. deepeval/metrics/prompt_alignment/template.py +12 -0
  77. deepeval/metrics/ragas.py +3 -3
  78. deepeval/metrics/role_adherence/role_adherence.py +48 -71
  79. deepeval/metrics/role_adherence/template.py +14 -0
  80. deepeval/metrics/role_violation/role_violation.py +75 -108
  81. deepeval/metrics/role_violation/template.py +12 -0
  82. deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
  83. deepeval/metrics/step_efficiency/template.py +11 -0
  84. deepeval/metrics/summarization/summarization.py +115 -183
  85. deepeval/metrics/summarization/template.py +19 -0
  86. deepeval/metrics/task_completion/task_completion.py +67 -73
  87. deepeval/metrics/tool_correctness/tool_correctness.py +45 -44
  88. deepeval/metrics/tool_use/tool_use.py +42 -66
  89. deepeval/metrics/topic_adherence/template.py +13 -0
  90. deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
  91. deepeval/metrics/toxicity/template.py +13 -0
  92. deepeval/metrics/toxicity/toxicity.py +80 -99
  93. deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  94. deepeval/metrics/turn_contextual_precision/template.py +187 -0
  95. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +592 -0
  96. deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  97. deepeval/metrics/turn_contextual_recall/template.py +178 -0
  98. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +563 -0
  99. deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
  100. deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
  101. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +576 -0
  102. deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
  103. deepeval/metrics/turn_faithfulness/template.py +218 -0
  104. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +627 -0
  105. deepeval/metrics/turn_relevancy/template.py +14 -0
  106. deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
  107. deepeval/metrics/utils.py +158 -122
  108. deepeval/models/__init__.py +0 -12
  109. deepeval/models/base_model.py +49 -33
  110. deepeval/models/embedding_models/__init__.py +7 -0
  111. deepeval/models/embedding_models/azure_embedding_model.py +79 -33
  112. deepeval/models/embedding_models/local_embedding_model.py +39 -20
  113. deepeval/models/embedding_models/ollama_embedding_model.py +52 -19
  114. deepeval/models/embedding_models/openai_embedding_model.py +42 -22
  115. deepeval/models/llms/amazon_bedrock_model.py +226 -72
  116. deepeval/models/llms/anthropic_model.py +178 -63
  117. deepeval/models/llms/azure_model.py +218 -60
  118. deepeval/models/llms/constants.py +2032 -0
  119. deepeval/models/llms/deepseek_model.py +95 -40
  120. deepeval/models/llms/gemini_model.py +209 -64
  121. deepeval/models/llms/grok_model.py +139 -68
  122. deepeval/models/llms/kimi_model.py +140 -90
  123. deepeval/models/llms/litellm_model.py +131 -37
  124. deepeval/models/llms/local_model.py +125 -21
  125. deepeval/models/llms/ollama_model.py +147 -24
  126. deepeval/models/llms/openai_model.py +222 -269
  127. deepeval/models/llms/portkey_model.py +81 -22
  128. deepeval/models/llms/utils.py +8 -3
  129. deepeval/models/retry_policy.py +17 -14
  130. deepeval/models/utils.py +106 -5
  131. deepeval/optimizer/__init__.py +5 -0
  132. deepeval/optimizer/algorithms/__init__.py +6 -0
  133. deepeval/optimizer/algorithms/base.py +29 -0
  134. deepeval/optimizer/algorithms/configs.py +18 -0
  135. deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  136. deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
  137. deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  138. deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
  139. deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  140. deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  141. deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  142. deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  143. deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  144. deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
  145. deepeval/{optimization → optimizer}/configs.py +5 -8
  146. deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
  147. deepeval/optimizer/prompt_optimizer.py +263 -0
  148. deepeval/optimizer/rewriter/__init__.py +5 -0
  149. deepeval/optimizer/rewriter/rewriter.py +124 -0
  150. deepeval/optimizer/rewriter/utils.py +214 -0
  151. deepeval/optimizer/scorer/__init__.py +5 -0
  152. deepeval/optimizer/scorer/base.py +86 -0
  153. deepeval/optimizer/scorer/scorer.py +316 -0
  154. deepeval/optimizer/scorer/utils.py +30 -0
  155. deepeval/optimizer/types.py +148 -0
  156. deepeval/{optimization → optimizer}/utils.py +47 -165
  157. deepeval/prompt/prompt.py +5 -9
  158. deepeval/simulator/conversation_simulator.py +43 -0
  159. deepeval/simulator/template.py +13 -0
  160. deepeval/test_case/__init__.py +1 -3
  161. deepeval/test_case/api.py +26 -45
  162. deepeval/test_case/arena_test_case.py +7 -2
  163. deepeval/test_case/conversational_test_case.py +68 -1
  164. deepeval/test_case/llm_test_case.py +206 -1
  165. deepeval/test_case/utils.py +4 -8
  166. deepeval/test_run/api.py +18 -14
  167. deepeval/test_run/test_run.py +3 -3
  168. deepeval/tracing/patchers.py +9 -4
  169. deepeval/tracing/tracing.py +2 -2
  170. deepeval/utils.py +65 -0
  171. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -4
  172. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/RECORD +180 -193
  173. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
  174. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
  175. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
  176. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
  177. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
  178. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
  179. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
  180. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
  181. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
  182. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
  183. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
  184. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  185. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
  186. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
  187. deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
  188. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
  189. deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
  190. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -148
  191. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
  192. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  193. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
  194. deepeval/models/mlllms/__init__.py +0 -4
  195. deepeval/models/mlllms/azure_model.py +0 -343
  196. deepeval/models/mlllms/gemini_model.py +0 -313
  197. deepeval/models/mlllms/ollama_model.py +0 -175
  198. deepeval/models/mlllms/openai_model.py +0 -309
  199. deepeval/optimization/__init__.py +0 -13
  200. deepeval/optimization/adapters/__init__.py +0 -2
  201. deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
  202. deepeval/optimization/aggregates.py +0 -14
  203. deepeval/optimization/copro/configs.py +0 -31
  204. deepeval/optimization/gepa/__init__.py +0 -7
  205. deepeval/optimization/gepa/configs.py +0 -115
  206. deepeval/optimization/miprov2/configs.py +0 -134
  207. deepeval/optimization/miprov2/loop.py +0 -785
  208. deepeval/optimization/mutations/__init__.py +0 -0
  209. deepeval/optimization/mutations/prompt_rewriter.py +0 -458
  210. deepeval/optimization/policies/__init__.py +0 -16
  211. deepeval/optimization/policies/tie_breaker.py +0 -67
  212. deepeval/optimization/prompt_optimizer.py +0 -462
  213. deepeval/optimization/simba/__init__.py +0 -0
  214. deepeval/optimization/simba/configs.py +0 -33
  215. deepeval/optimization/types.py +0 -361
  216. deepeval/test_case/mllm_test_case.py +0 -170
  217. /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
  218. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
  219. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
  220. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
  221. /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
  222. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
  223. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
  224. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
deepeval/_version.py CHANGED
@@ -1 +1 @@
1
- __version__: str = "3.7.4"
1
+ __version__: str = "3.7.6"
deepeval/config/settings.py CHANGED
@@ -27,6 +27,7 @@ from pydantic import (
27
27
  field_validator,
28
28
  model_validator,
29
29
  SecretStr,
30
+ PositiveFloat,
30
31
  )
31
32
  from pydantic_settings import BaseSettings, SettingsConfigDict
32
33
  from typing import Any, Dict, List, Optional, NamedTuple
@@ -317,6 +318,19 @@ class Settings(BaseSettings):
317
318
 
318
319
  # Anthropic
319
320
  ANTHROPIC_API_KEY: Optional[SecretStr] = None
321
+ ANTHROPIC_MODEL_NAME: Optional[str] = None
322
+ ANTHROPIC_COST_PER_INPUT_TOKEN: Optional[PositiveFloat] = None
323
+ ANTHROPIC_COST_PER_OUTPUT_TOKEN: Optional[PositiveFloat] = None
324
+
325
+ # AWS
326
+ AWS_ACCESS_KEY_ID: Optional[SecretStr] = None
327
+ AWS_SECRET_ACCESS_KEY: Optional[SecretStr] = None
328
+ # AWS Bedrock
329
+ USE_AWS_BEDROCK_MODEL: Optional[bool] = None
330
+ AWS_BEDROCK_MODEL_NAME: Optional[str] = None
331
+ AWS_BEDROCK_REGION: Optional[str] = None
332
+ AWS_BEDROCK_COST_PER_INPUT_TOKEN: Optional[PositiveFloat] = None
333
+ AWS_BEDROCK_COST_PER_OUTPUT_TOKEN: Optional[PositiveFloat] = None
320
334
  # Azure Open AI
321
335
  AZURE_OPENAI_API_KEY: Optional[SecretStr] = None
322
336
  AZURE_OPENAI_ENDPOINT: Optional[AnyUrl] = None
@@ -329,6 +343,8 @@ class Settings(BaseSettings):
329
343
  USE_DEEPSEEK_MODEL: Optional[bool] = None
330
344
  DEEPSEEK_API_KEY: Optional[SecretStr] = None
331
345
  DEEPSEEK_MODEL_NAME: Optional[str] = None
346
+ DEEPSEEK_COST_PER_INPUT_TOKEN: Optional[float] = None
347
+ DEEPSEEK_COST_PER_OUTPUT_TOKEN: Optional[float] = None
332
348
  # Gemini
333
349
  USE_GEMINI_MODEL: Optional[bool] = None
334
350
  GOOGLE_API_KEY: Optional[SecretStr] = None
@@ -336,11 +352,13 @@ class Settings(BaseSettings):
336
352
  GOOGLE_GENAI_USE_VERTEXAI: Optional[bool] = None
337
353
  GOOGLE_CLOUD_PROJECT: Optional[str] = None
338
354
  GOOGLE_CLOUD_LOCATION: Optional[str] = None
339
- GOOGLE_SERVICE_ACCOUNT_KEY: Optional[str] = None
355
+ GOOGLE_SERVICE_ACCOUNT_KEY: Optional[SecretStr] = None
340
356
  # Grok
341
357
  USE_GROK_MODEL: Optional[bool] = None
342
358
  GROK_API_KEY: Optional[SecretStr] = None
343
359
  GROK_MODEL_NAME: Optional[str] = None
360
+ GROK_COST_PER_INPUT_TOKEN: Optional[float] = None
361
+ GROK_COST_PER_OUTPUT_TOKEN: Optional[float] = None
344
362
  # LiteLLM
345
363
  USE_LITELLM: Optional[bool] = None
346
364
  LITELLM_API_KEY: Optional[SecretStr] = None
@@ -362,6 +380,8 @@ class Settings(BaseSettings):
362
380
  USE_MOONSHOT_MODEL: Optional[bool] = None
363
381
  MOONSHOT_API_KEY: Optional[SecretStr] = None
364
382
  MOONSHOT_MODEL_NAME: Optional[str] = None
383
+ MOONSHOT_COST_PER_INPUT_TOKEN: Optional[float] = None
384
+ MOONSHOT_COST_PER_OUTPUT_TOKEN: Optional[float] = None
365
385
  # Ollama
366
386
  OLLAMA_MODEL_NAME: Optional[str] = None
367
387
  # OpenAI
@@ -388,6 +408,7 @@ class Settings(BaseSettings):
388
408
 
389
409
  # Azure OpenAI
390
410
  USE_AZURE_OPENAI_EMBEDDING: Optional[bool] = None
411
+ AZURE_EMBEDDING_MODEL_NAME: Optional[str] = None
391
412
  AZURE_EMBEDDING_DEPLOYMENT_NAME: Optional[str] = None
392
413
  # Local
393
414
  USE_LOCAL_EMBEDDINGS: Optional[bool] = None
@@ -614,6 +635,7 @@ class Settings(BaseSettings):
614
635
  "SKIP_DEEPEVAL_MISSING_PARAMS",
615
636
  "TOKENIZERS_PARALLELISM",
616
637
  "TRANSFORMERS_NO_ADVISORY_WARNINGS",
638
+ "USE_AWS_BEDROCK_MODEL",
617
639
  "USE_OPENAI_MODEL",
618
640
  "USE_AZURE_OPENAI",
619
641
  "USE_LOCAL_MODEL",
@@ -647,6 +669,8 @@ class Settings(BaseSettings):
647
669
  @field_validator(
648
670
  "OPENAI_COST_PER_INPUT_TOKEN",
649
671
  "OPENAI_COST_PER_OUTPUT_TOKEN",
672
+ "AWS_BEDROCK_COST_PER_INPUT_TOKEN",
673
+ "AWS_BEDROCK_COST_PER_OUTPUT_TOKEN",
650
674
  "TEMPERATURE",
651
675
  "CONFIDENT_TRACE_SAMPLE_RATE",
652
676
  "CONFIDENT_METRIC_LOGGING_SAMPLE_RATE",
@@ -717,6 +741,16 @@ class Settings(BaseSettings):
717
741
  return None
718
742
  return s.upper()
719
743
 
744
+ @field_validator("AWS_BEDROCK_REGION", mode="before")
745
+ @classmethod
746
+ def _normalize_lower(cls, v):
747
+ if v is None:
748
+ return None
749
+ s = str(v).strip()
750
+ if not s:
751
+ return None
752
+ return s.lower()
753
+
720
754
  @field_validator("DEEPEVAL_SDK_RETRY_PROVIDERS", mode="before")
721
755
  @classmethod
722
756
  def _coerce_to_list(cls, v):
deepeval/dataset/api.py CHANGED
@@ -1,4 +1,4 @@
1
- from pydantic import BaseModel, Field
1
+ from pydantic import BaseModel, Field, model_validator
2
2
  from typing import Optional, List
3
3
 
4
4
  from deepeval.dataset.golden import Golden, ConversationalGolden
@@ -11,6 +11,17 @@ class APIDataset(BaseModel):
11
11
  None, alias="conversationalGoldens"
12
12
  )
13
13
 
14
+ @model_validator(mode="after")
15
+ def set_image_mappings_for_goldens(self):
16
+ if self.goldens:
17
+ for golden in self.goldens:
18
+ golden.images_mapping = golden._get_images_mapping()
19
+ if self.conversational_goldens:
20
+ for golden in self.conversational_goldens:
21
+ golden.images_mapping = golden._get_images_mapping()
22
+
23
+ return self
24
+
14
25
 
15
26
  class APIQueueDataset(BaseModel):
16
27
  alias: str
@@ -19,6 +30,17 @@ class APIQueueDataset(BaseModel):
19
30
  None, alias="conversationalGoldens"
20
31
  )
21
32
 
33
+ @model_validator(mode="after")
34
+ def set_image_mappings_for_goldens(self):
35
+ if self.goldens:
36
+ for golden in self.goldens:
37
+ golden.images_mapping = golden._get_images_mapping()
38
+ if self.conversational_goldens:
39
+ for golden in self.conversational_goldens:
40
+ golden.images_mapping = golden._get_images_mapping()
41
+
42
+ return self
43
+
22
44
 
23
45
  class DatasetHttpResponse(BaseModel):
24
46
  id: str
deepeval/dataset/golden.py CHANGED
@@ -1,6 +1,8 @@
1
- from pydantic import BaseModel, Field, PrivateAttr
1
+ import re
2
+ from pydantic import BaseModel, Field, PrivateAttr, model_validator
2
3
  from typing import Optional, Dict, List
3
- from deepeval.test_case import ToolCall, Turn
4
+ from deepeval.test_case import ToolCall, Turn, MLLMImage
5
+ from deepeval.test_case.llm_test_case import _MLLM_IMAGE_REGISTRY
4
6
 
5
7
 
6
8
  class Golden(BaseModel):
@@ -32,10 +34,76 @@ class Golden(BaseModel):
32
34
  custom_column_key_values: Optional[Dict[str, str]] = Field(
33
35
  default=None, serialization_alias="customColumnKeyValues"
34
36
  )
37
+ multimodal: bool = Field(False, exclude=True)
38
+ images_mapping: Dict[str, MLLMImage] = Field(
39
+ default=None, alias="imagesMapping"
40
+ )
35
41
  _dataset_rank: Optional[int] = PrivateAttr(default=None)
36
42
  _dataset_alias: Optional[str] = PrivateAttr(default=None)
37
43
  _dataset_id: Optional[str] = PrivateAttr(default=None)
38
44
 
45
+ @model_validator(mode="after")
46
+ def set_is_multimodal(self):
47
+ import re
48
+
49
+ if self.multimodal is True:
50
+ return self
51
+
52
+ pattern = r"\[DEEPEVAL:IMAGE:(.*?)\]"
53
+ auto_detect = (
54
+ any(
55
+ [
56
+ re.search(pattern, self.input or "") is not None,
57
+ re.search(pattern, self.actual_output or "") is not None,
58
+ ]
59
+ )
60
+ if isinstance(self.input, str)
61
+ else self.multimodal
62
+ )
63
+ if self.retrieval_context is not None:
64
+ auto_detect = auto_detect or any(
65
+ re.search(pattern, context) is not None
66
+ for context in self.retrieval_context
67
+ )
68
+ if self.context is not None:
69
+ auto_detect = auto_detect or any(
70
+ re.search(pattern, context) is not None
71
+ for context in self.context
72
+ )
73
+
74
+ self.multimodal = auto_detect
75
+
76
+ return self
77
+
78
+ def _get_images_mapping(self) -> Dict[str, MLLMImage]:
79
+ pattern = r"\[DEEPEVAL:IMAGE:(.*?)\]"
80
+ image_ids = set()
81
+
82
+ def extract_ids_from_string(s: Optional[str]) -> None:
83
+ """Helper to extract image IDs from a string."""
84
+ if s is not None and isinstance(s, str):
85
+ matches = re.findall(pattern, s)
86
+ image_ids.update(matches)
87
+
88
+ def extract_ids_from_list(lst: Optional[List[str]]) -> None:
89
+ """Helper to extract image IDs from a list of strings."""
90
+ if lst is not None:
91
+ for item in lst:
92
+ extract_ids_from_string(item)
93
+
94
+ extract_ids_from_string(self.input)
95
+ extract_ids_from_string(self.actual_output)
96
+ extract_ids_from_string(self.expected_output)
97
+ extract_ids_from_list(self.context)
98
+ extract_ids_from_list(self.retrieval_context)
99
+
100
+ images_mapping = {}
101
+ for img_id in image_ids:
102
+ if img_id in _MLLM_IMAGE_REGISTRY:
103
+ images_mapping[img_id] = _MLLM_IMAGE_REGISTRY[img_id]
104
+
105
+ return images_mapping if len(images_mapping) > 0 else None
106
+
39
107
 
40
108
  class ConversationalGolden(BaseModel):
41
109
  scenario: str
@@ -55,6 +123,75 @@ class ConversationalGolden(BaseModel):
55
123
  default=None, serialization_alias="customColumnKeyValues"
56
124
  )
57
125
  turns: Optional[List[Turn]] = Field(default=None)
126
+ multimodal: bool = Field(False, exclude=True)
127
+ images_mapping: Dict[str, MLLMImage] = Field(
128
+ default=None, alias="imagesMapping"
129
+ )
58
130
  _dataset_rank: Optional[int] = PrivateAttr(default=None)
59
131
  _dataset_alias: Optional[str] = PrivateAttr(default=None)
60
132
  _dataset_id: Optional[str] = PrivateAttr(default=None)
133
+
134
+ @model_validator(mode="after")
135
+ def set_is_multimodal(self):
136
+ import re
137
+
138
+ if self.multimodal is True:
139
+ return self
140
+
141
+ pattern = r"\[DEEPEVAL:IMAGE:(.*?)\]"
142
+ if self.scenario:
143
+ if re.search(pattern, self.scenario) is not None:
144
+ self.multimodal = True
145
+ return self
146
+ if self.expected_outcome:
147
+ if re.search(pattern, self.expected_outcome) is not None:
148
+ self.multimodal = True
149
+ return self
150
+ if self.user_description:
151
+ if re.search(pattern, self.user_description) is not None:
152
+ self.multimodal = True
153
+ return self
154
+ if self.turns:
155
+ for turn in self.turns:
156
+ if re.search(pattern, turn.content) is not None:
157
+ self.multimodal = True
158
+ return self
159
+ if turn.retrieval_context is not None:
160
+ self.multimodal = any(
161
+ re.search(pattern, context) is not None
162
+ for context in turn.retrieval_context
163
+ )
164
+
165
+ return self
166
+
167
+ def _get_images_mapping(self) -> Dict[str, MLLMImage]:
168
+ pattern = r"\[DEEPEVAL:IMAGE:(.*?)\]"
169
+ image_ids = set()
170
+
171
+ def extract_ids_from_string(s: Optional[str]) -> None:
172
+ """Helper to extract image IDs from a string."""
173
+ if s is not None and isinstance(s, str):
174
+ matches = re.findall(pattern, s)
175
+ image_ids.update(matches)
176
+
177
+ def extract_ids_from_list(lst: Optional[List[str]]) -> None:
178
+ """Helper to extract image IDs from a list of strings."""
179
+ if lst is not None:
180
+ for item in lst:
181
+ extract_ids_from_string(item)
182
+
183
+ extract_ids_from_string(self.scenario)
184
+ extract_ids_from_string(self.expected_outcome)
185
+ extract_ids_from_list(self.context)
186
+ extract_ids_from_string(self.user_description)
187
+ if self.turns:
188
+ for turn in self.turns:
189
+ extract_ids_from_string(turn.content)
190
+ extract_ids_from_list(turn.retrieval_context)
191
+
192
+ images_mapping = {}
193
+ for img_id in image_ids:
194
+ if img_id in _MLLM_IMAGE_REGISTRY:
195
+ images_mapping[img_id] = _MLLM_IMAGE_REGISTRY[img_id]
196
+
197
+ return images_mapping if len(images_mapping) > 0 else None
deepeval/evaluate/evaluate.py CHANGED
@@ -46,7 +46,6 @@ from deepeval.telemetry import capture_evaluation_run
46
46
  from deepeval.metrics import (
47
47
  BaseMetric,
48
48
  BaseConversationalMetric,
49
- BaseMultimodalMetric,
50
49
  )
51
50
  from deepeval.metrics.indicator import (
52
51
  format_metric_description,
@@ -54,7 +53,6 @@ from deepeval.metrics.indicator import (
54
53
  from deepeval.test_case import (
55
54
  LLMTestCase,
56
55
  ConversationalTestCase,
57
- MLLMTestCase,
58
56
  )
59
57
  from deepeval.test_run import (
60
58
  global_test_run_manager,
@@ -71,14 +69,11 @@ from deepeval.evaluate.execute import (
71
69
 
72
70
 
73
71
  def assert_test(
74
- test_case: Optional[
75
- Union[LLMTestCase, ConversationalTestCase, MLLMTestCase]
76
- ] = None,
72
+ test_case: Optional[Union[LLMTestCase, ConversationalTestCase]] = None,
77
73
  metrics: Optional[
78
74
  Union[
79
75
  List[BaseMetric],
80
76
  List[BaseConversationalMetric],
81
- List[BaseMultimodalMetric],
82
77
  ]
83
78
  ] = None,
84
79
  golden: Optional[Golden] = None,
@@ -175,7 +170,7 @@ def assert_test(
175
170
  try:
176
171
  if not metric_data.success:
177
172
  failed_metrics_data.append(metric_data)
178
- except:
173
+ except Exception:
179
174
  failed_metrics_data.append(metric_data)
180
175
 
181
176
  failed_metrics_str = ", ".join(
@@ -188,14 +183,11 @@ def assert_test(
188
183
 
189
184
 
190
185
  def evaluate(
191
- test_cases: Union[
192
- List[LLMTestCase], List[ConversationalTestCase], List[MLLMTestCase]
193
- ],
186
+ test_cases: Union[List[LLMTestCase], List[ConversationalTestCase]],
194
187
  metrics: Optional[
195
188
  Union[
196
189
  List[BaseMetric],
197
190
  List[BaseConversationalMetric],
198
- List[BaseMultimodalMetric],
199
191
  ]
200
192
  ] = None,
201
193
  # Evals on Confident AI
@@ -272,6 +264,19 @@ def evaluate(
272
264
  test_run.hyperparameters = process_hyperparameters(hyperparameters)
273
265
  test_run.prompts = process_prompts(hyperparameters)
274
266
  global_test_run_manager.save_test_run(TEMP_FILE_PATH)
267
+
268
+ # In CLI mode (`deepeval test run`), the CLI owns finalization and will
269
+ # call `wrap_up_test_run()` once after pytest finishes. Finalizing here
270
+ # as well would double finalize the run and consequently result in
271
+ # duplicate uploads / local saves and temp file races, so only
272
+ # do it when we're NOT in CLI mode.
273
+ if get_is_running_deepeval():
274
+ return EvaluationResult(
275
+ test_results=test_results,
276
+ confident_link=None,
277
+ test_run_id=None,
278
+ )
279
+
275
280
  res = global_test_run_manager.wrap_up_test_run(
276
281
  run_duration, display_table=False
277
282
  )