deepeval 3.7.5__py3-none-any.whl → 3.7.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150)
  1. deepeval/_version.py +1 -1
  2. deepeval/cli/main.py +2022 -759
  3. deepeval/cli/utils.py +208 -36
  4. deepeval/config/dotenv_handler.py +19 -0
  5. deepeval/config/settings.py +675 -245
  6. deepeval/config/utils.py +9 -1
  7. deepeval/dataset/api.py +23 -1
  8. deepeval/dataset/golden.py +106 -21
  9. deepeval/evaluate/evaluate.py +0 -3
  10. deepeval/evaluate/execute.py +162 -315
  11. deepeval/evaluate/utils.py +6 -30
  12. deepeval/key_handler.py +124 -51
  13. deepeval/metrics/__init__.py +0 -4
  14. deepeval/metrics/answer_relevancy/answer_relevancy.py +89 -132
  15. deepeval/metrics/answer_relevancy/template.py +102 -179
  16. deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
  17. deepeval/metrics/arena_g_eval/template.py +17 -1
  18. deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
  19. deepeval/metrics/argument_correctness/template.py +19 -2
  20. deepeval/metrics/base_metric.py +19 -41
  21. deepeval/metrics/bias/bias.py +102 -108
  22. deepeval/metrics/bias/template.py +14 -2
  23. deepeval/metrics/contextual_precision/contextual_precision.py +56 -92
  24. deepeval/metrics/contextual_recall/contextual_recall.py +58 -85
  25. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +53 -83
  26. deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
  27. deepeval/metrics/conversation_completeness/template.py +23 -3
  28. deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
  29. deepeval/metrics/conversational_dag/nodes.py +66 -123
  30. deepeval/metrics/conversational_dag/templates.py +16 -0
  31. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
  32. deepeval/metrics/dag/dag.py +10 -0
  33. deepeval/metrics/dag/nodes.py +63 -126
  34. deepeval/metrics/dag/templates.py +14 -0
  35. deepeval/metrics/exact_match/exact_match.py +9 -1
  36. deepeval/metrics/faithfulness/faithfulness.py +82 -136
  37. deepeval/metrics/g_eval/g_eval.py +93 -79
  38. deepeval/metrics/g_eval/template.py +18 -1
  39. deepeval/metrics/g_eval/utils.py +7 -6
  40. deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
  41. deepeval/metrics/goal_accuracy/template.py +21 -3
  42. deepeval/metrics/hallucination/hallucination.py +60 -75
  43. deepeval/metrics/hallucination/template.py +13 -0
  44. deepeval/metrics/indicator.py +11 -10
  45. deepeval/metrics/json_correctness/json_correctness.py +40 -38
  46. deepeval/metrics/json_correctness/template.py +10 -0
  47. deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
  48. deepeval/metrics/knowledge_retention/schema.py +9 -3
  49. deepeval/metrics/knowledge_retention/template.py +12 -0
  50. deepeval/metrics/mcp/mcp_task_completion.py +72 -43
  51. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +93 -75
  52. deepeval/metrics/mcp/schema.py +4 -0
  53. deepeval/metrics/mcp/template.py +59 -0
  54. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
  55. deepeval/metrics/mcp_use_metric/template.py +12 -0
  56. deepeval/metrics/misuse/misuse.py +77 -97
  57. deepeval/metrics/misuse/template.py +15 -0
  58. deepeval/metrics/multimodal_metrics/__init__.py +0 -1
  59. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +37 -38
  60. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +55 -76
  61. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +37 -38
  62. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +37 -38
  63. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +57 -76
  64. deepeval/metrics/non_advice/non_advice.py +79 -105
  65. deepeval/metrics/non_advice/template.py +12 -0
  66. deepeval/metrics/pattern_match/pattern_match.py +12 -4
  67. deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
  68. deepeval/metrics/pii_leakage/template.py +14 -0
  69. deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
  70. deepeval/metrics/plan_adherence/template.py +11 -0
  71. deepeval/metrics/plan_quality/plan_quality.py +63 -87
  72. deepeval/metrics/plan_quality/template.py +9 -0
  73. deepeval/metrics/prompt_alignment/prompt_alignment.py +78 -86
  74. deepeval/metrics/prompt_alignment/template.py +12 -0
  75. deepeval/metrics/role_adherence/role_adherence.py +48 -71
  76. deepeval/metrics/role_adherence/template.py +14 -0
  77. deepeval/metrics/role_violation/role_violation.py +75 -108
  78. deepeval/metrics/role_violation/template.py +12 -0
  79. deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
  80. deepeval/metrics/step_efficiency/template.py +11 -0
  81. deepeval/metrics/summarization/summarization.py +115 -183
  82. deepeval/metrics/summarization/template.py +19 -0
  83. deepeval/metrics/task_completion/task_completion.py +67 -73
  84. deepeval/metrics/tool_correctness/tool_correctness.py +43 -42
  85. deepeval/metrics/tool_use/schema.py +4 -0
  86. deepeval/metrics/tool_use/template.py +16 -2
  87. deepeval/metrics/tool_use/tool_use.py +72 -94
  88. deepeval/metrics/topic_adherence/schema.py +4 -0
  89. deepeval/metrics/topic_adherence/template.py +21 -1
  90. deepeval/metrics/topic_adherence/topic_adherence.py +68 -81
  91. deepeval/metrics/toxicity/template.py +13 -0
  92. deepeval/metrics/toxicity/toxicity.py +80 -99
  93. deepeval/metrics/turn_contextual_precision/schema.py +3 -3
  94. deepeval/metrics/turn_contextual_precision/template.py +9 -2
  95. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +154 -154
  96. deepeval/metrics/turn_contextual_recall/schema.py +3 -3
  97. deepeval/metrics/turn_contextual_recall/template.py +8 -1
  98. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +148 -143
  99. deepeval/metrics/turn_contextual_relevancy/schema.py +2 -2
  100. deepeval/metrics/turn_contextual_relevancy/template.py +8 -1
  101. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +154 -157
  102. deepeval/metrics/turn_faithfulness/schema.py +1 -1
  103. deepeval/metrics/turn_faithfulness/template.py +8 -1
  104. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +180 -203
  105. deepeval/metrics/turn_relevancy/template.py +14 -0
  106. deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
  107. deepeval/metrics/utils.py +161 -91
  108. deepeval/models/__init__.py +2 -0
  109. deepeval/models/base_model.py +44 -6
  110. deepeval/models/embedding_models/azure_embedding_model.py +34 -12
  111. deepeval/models/embedding_models/local_embedding_model.py +22 -7
  112. deepeval/models/embedding_models/ollama_embedding_model.py +17 -6
  113. deepeval/models/embedding_models/openai_embedding_model.py +3 -2
  114. deepeval/models/llms/__init__.py +2 -0
  115. deepeval/models/llms/amazon_bedrock_model.py +229 -73
  116. deepeval/models/llms/anthropic_model.py +143 -48
  117. deepeval/models/llms/azure_model.py +169 -95
  118. deepeval/models/llms/constants.py +2032 -0
  119. deepeval/models/llms/deepseek_model.py +82 -35
  120. deepeval/models/llms/gemini_model.py +126 -67
  121. deepeval/models/llms/grok_model.py +128 -65
  122. deepeval/models/llms/kimi_model.py +129 -87
  123. deepeval/models/llms/litellm_model.py +94 -18
  124. deepeval/models/llms/local_model.py +115 -16
  125. deepeval/models/llms/ollama_model.py +97 -76
  126. deepeval/models/llms/openai_model.py +169 -311
  127. deepeval/models/llms/portkey_model.py +58 -16
  128. deepeval/models/llms/utils.py +5 -2
  129. deepeval/models/retry_policy.py +10 -5
  130. deepeval/models/utils.py +56 -4
  131. deepeval/simulator/conversation_simulator.py +49 -2
  132. deepeval/simulator/template.py +16 -1
  133. deepeval/synthesizer/synthesizer.py +19 -17
  134. deepeval/test_case/api.py +24 -45
  135. deepeval/test_case/arena_test_case.py +7 -2
  136. deepeval/test_case/conversational_test_case.py +55 -6
  137. deepeval/test_case/llm_test_case.py +60 -6
  138. deepeval/test_run/api.py +3 -0
  139. deepeval/test_run/test_run.py +6 -1
  140. deepeval/utils.py +26 -0
  141. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/METADATA +3 -3
  142. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/RECORD +145 -148
  143. deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
  144. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
  145. deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
  146. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -133
  147. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
  148. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/LICENSE.md +0 -0
  149. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/WHEEL +0 -0
  150. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/entry_points.txt +0 -0
deepeval/config/utils.py CHANGED
@@ -1,7 +1,8 @@
1
1
  import json
2
2
  import os
3
3
  import re
4
-
4
+ from dotenv import dotenv_values
5
+ from pathlib import Path
5
6
  from typing import Any, Iterable, List, Optional
6
7
 
7
8
 
@@ -142,3 +143,10 @@ def dedupe_preserve_order(items: Iterable[str]) -> List[str]:
142
143
  def constrain_between(value: float, lo: float, hi: float) -> float:
143
144
  """Return value constrained to the inclusive range [lo, hi]."""
144
145
  return min(max(value, lo), hi)
146
+
147
+
148
+ def read_dotenv_file(path: Path) -> dict[str, str]:
149
+ if not path.exists():
150
+ return {}
151
+ values = dotenv_values(path)
152
+ return {key: value for key, value in values.items() if value is not None}
deepeval/dataset/api.py CHANGED
@@ -1,4 +1,4 @@
1
- from pydantic import BaseModel, Field
1
+ from pydantic import BaseModel, Field, model_validator
2
2
  from typing import Optional, List
3
3
 
4
4
  from deepeval.dataset.golden import Golden, ConversationalGolden
@@ -11,6 +11,17 @@ class APIDataset(BaseModel):
11
11
  None, alias="conversationalGoldens"
12
12
  )
13
13
 
14
+ @model_validator(mode="after")
15
+ def set_image_mappings_for_goldens(self):
16
+ if self.goldens:
17
+ for golden in self.goldens:
18
+ golden.images_mapping = golden._get_images_mapping()
19
+ if self.conversational_goldens:
20
+ for golden in self.conversational_goldens:
21
+ golden.images_mapping = golden._get_images_mapping()
22
+
23
+ return self
24
+
14
25
 
15
26
  class APIQueueDataset(BaseModel):
16
27
  alias: str
@@ -19,6 +30,17 @@ class APIQueueDataset(BaseModel):
19
30
  None, alias="conversationalGoldens"
20
31
  )
21
32
 
33
+ @model_validator(mode="after")
34
+ def set_image_mappings_for_goldens(self):
35
+ if self.goldens:
36
+ for golden in self.goldens:
37
+ golden.images_mapping = golden._get_images_mapping()
38
+ if self.conversational_goldens:
39
+ for golden in self.conversational_goldens:
40
+ golden.images_mapping = golden._get_images_mapping()
41
+
42
+ return self
43
+
22
44
 
23
45
  class DatasetHttpResponse(BaseModel):
24
46
  id: str
deepeval/dataset/golden.py CHANGED
@@ -1,6 +1,8 @@
1
+ import re
1
2
  from pydantic import BaseModel, Field, PrivateAttr, model_validator
2
3
  from typing import Optional, Dict, List
3
4
  from deepeval.test_case import ToolCall, Turn, MLLMImage
5
+ from deepeval.test_case.llm_test_case import _MLLM_IMAGE_REGISTRY
4
6
 
5
7
 
6
8
  class Golden(BaseModel):
@@ -33,6 +35,9 @@ class Golden(BaseModel):
33
35
  default=None, serialization_alias="customColumnKeyValues"
34
36
  )
35
37
  multimodal: bool = Field(False, exclude=True)
38
+ images_mapping: Dict[str, MLLMImage] = Field(
39
+ default=None, alias="imagesMapping"
40
+ )
36
41
  _dataset_rank: Optional[int] = PrivateAttr(default=None)
37
42
  _dataset_alias: Optional[str] = PrivateAttr(default=None)
38
43
  _dataset_id: Optional[str] = PrivateAttr(default=None)
@@ -45,27 +50,60 @@ class Golden(BaseModel):
45
50
  return self
46
51
 
47
52
  pattern = r"\[DEEPEVAL:IMAGE:(.*?)\]"
48
- self.multimodal = (
53
+ auto_detect = (
49
54
  any(
50
55
  [
51
- (
52
- re.search(pattern, self.input) is not None
53
- if self.input
54
- else False
55
- ),
56
- (
57
- re.search(pattern, self.actual_output) is not None
58
- if self.actual_output
59
- else False
60
- ),
56
+ re.search(pattern, self.input or "") is not None,
57
+ re.search(pattern, self.actual_output or "") is not None,
61
58
  ]
62
59
  )
63
60
  if isinstance(self.input, str)
64
61
  else self.multimodal
65
62
  )
63
+ if self.retrieval_context is not None:
64
+ auto_detect = auto_detect or any(
65
+ re.search(pattern, context) is not None
66
+ for context in self.retrieval_context
67
+ )
68
+ if self.context is not None:
69
+ auto_detect = auto_detect or any(
70
+ re.search(pattern, context) is not None
71
+ for context in self.context
72
+ )
73
+
74
+ self.multimodal = auto_detect
66
75
 
67
76
  return self
68
77
 
78
+ def _get_images_mapping(self) -> Dict[str, MLLMImage]:
79
+ pattern = r"\[DEEPEVAL:IMAGE:(.*?)\]"
80
+ image_ids = set()
81
+
82
+ def extract_ids_from_string(s: Optional[str]) -> None:
83
+ """Helper to extract image IDs from a string."""
84
+ if s is not None and isinstance(s, str):
85
+ matches = re.findall(pattern, s)
86
+ image_ids.update(matches)
87
+
88
+ def extract_ids_from_list(lst: Optional[List[str]]) -> None:
89
+ """Helper to extract image IDs from a list of strings."""
90
+ if lst is not None:
91
+ for item in lst:
92
+ extract_ids_from_string(item)
93
+
94
+ extract_ids_from_string(self.input)
95
+ extract_ids_from_string(self.actual_output)
96
+ extract_ids_from_string(self.expected_output)
97
+ extract_ids_from_list(self.context)
98
+ extract_ids_from_list(self.retrieval_context)
99
+
100
+ images_mapping = {}
101
+ for img_id in image_ids:
102
+ if img_id in _MLLM_IMAGE_REGISTRY:
103
+ images_mapping[img_id] = _MLLM_IMAGE_REGISTRY[img_id]
104
+
105
+ return images_mapping if len(images_mapping) > 0 else None
106
+
69
107
 
70
108
  class ConversationalGolden(BaseModel):
71
109
  scenario: str
@@ -86,6 +124,9 @@ class ConversationalGolden(BaseModel):
86
124
  )
87
125
  turns: Optional[List[Turn]] = Field(default=None)
88
126
  multimodal: bool = Field(False, exclude=True)
127
+ images_mapping: Dict[str, MLLMImage] = Field(
128
+ default=None, alias="imagesMapping"
129
+ )
89
130
  _dataset_rank: Optional[int] = PrivateAttr(default=None)
90
131
  _dataset_alias: Optional[str] = PrivateAttr(default=None)
91
132
  _dataset_id: Optional[str] = PrivateAttr(default=None)
@@ -98,15 +139,59 @@ class ConversationalGolden(BaseModel):
98
139
  return self
99
140
 
100
141
  pattern = r"\[DEEPEVAL:IMAGE:(.*?)\]"
101
- self.multimodal = (
102
- any(
103
- [
104
- re.search(pattern, turn.content) is not None
105
- for turn in self.turns
106
- ]
107
- )
108
- if self.turns
109
- else self.multimodal
110
- )
142
+ if self.scenario:
143
+ if re.search(pattern, self.scenario) is not None:
144
+ self.multimodal = True
145
+ return self
146
+ if self.expected_outcome:
147
+ if re.search(pattern, self.expected_outcome) is not None:
148
+ self.multimodal = True
149
+ return self
150
+ if self.user_description:
151
+ if re.search(pattern, self.user_description) is not None:
152
+ self.multimodal = True
153
+ return self
154
+ if self.turns:
155
+ for turn in self.turns:
156
+ if re.search(pattern, turn.content) is not None:
157
+ self.multimodal = True
158
+ return self
159
+ if turn.retrieval_context is not None:
160
+ self.multimodal = any(
161
+ re.search(pattern, context) is not None
162
+ for context in turn.retrieval_context
163
+ )
111
164
 
112
165
  return self
166
+
167
+ def _get_images_mapping(self) -> Dict[str, MLLMImage]:
168
+ pattern = r"\[DEEPEVAL:IMAGE:(.*?)\]"
169
+ image_ids = set()
170
+
171
+ def extract_ids_from_string(s: Optional[str]) -> None:
172
+ """Helper to extract image IDs from a string."""
173
+ if s is not None and isinstance(s, str):
174
+ matches = re.findall(pattern, s)
175
+ image_ids.update(matches)
176
+
177
+ def extract_ids_from_list(lst: Optional[List[str]]) -> None:
178
+ """Helper to extract image IDs from a list of strings."""
179
+ if lst is not None:
180
+ for item in lst:
181
+ extract_ids_from_string(item)
182
+
183
+ extract_ids_from_string(self.scenario)
184
+ extract_ids_from_string(self.expected_outcome)
185
+ extract_ids_from_list(self.context)
186
+ extract_ids_from_string(self.user_description)
187
+ if self.turns:
188
+ for turn in self.turns:
189
+ extract_ids_from_string(turn.content)
190
+ extract_ids_from_list(turn.retrieval_context)
191
+
192
+ images_mapping = {}
193
+ for img_id in image_ids:
194
+ if img_id in _MLLM_IMAGE_REGISTRY:
195
+ images_mapping[img_id] = _MLLM_IMAGE_REGISTRY[img_id]
196
+
197
+ return images_mapping if len(images_mapping) > 0 else None
deepeval/evaluate/evaluate.py CHANGED
@@ -46,7 +46,6 @@ from deepeval.telemetry import capture_evaluation_run
46
46
  from deepeval.metrics import (
47
47
  BaseMetric,
48
48
  BaseConversationalMetric,
49
- BaseMultimodalMetric,
50
49
  )
51
50
  from deepeval.metrics.indicator import (
52
51
  format_metric_description,
@@ -75,7 +74,6 @@ def assert_test(
75
74
  Union[
76
75
  List[BaseMetric],
77
76
  List[BaseConversationalMetric],
78
- List[BaseMultimodalMetric],
79
77
  ]
80
78
  ] = None,
81
79
  golden: Optional[Golden] = None,
@@ -190,7 +188,6 @@ def evaluate(
190
188
  Union[
191
189
  List[BaseMetric],
192
190
  List[BaseConversationalMetric],
193
- List[BaseMultimodalMetric],
194
191
  ]
195
192
  ] = None,
196
193
  # Evals on Confident AI