deepeval 3.7.5__py3-none-any.whl → 3.7.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/cli/main.py +2022 -759
- deepeval/cli/utils.py +208 -36
- deepeval/config/dotenv_handler.py +19 -0
- deepeval/config/settings.py +675 -245
- deepeval/config/utils.py +9 -1
- deepeval/dataset/api.py +23 -1
- deepeval/dataset/golden.py +106 -21
- deepeval/evaluate/evaluate.py +0 -3
- deepeval/evaluate/execute.py +162 -315
- deepeval/evaluate/utils.py +6 -30
- deepeval/key_handler.py +124 -51
- deepeval/metrics/__init__.py +0 -4
- deepeval/metrics/answer_relevancy/answer_relevancy.py +89 -132
- deepeval/metrics/answer_relevancy/template.py +102 -179
- deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
- deepeval/metrics/arena_g_eval/template.py +17 -1
- deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
- deepeval/metrics/argument_correctness/template.py +19 -2
- deepeval/metrics/base_metric.py +19 -41
- deepeval/metrics/bias/bias.py +102 -108
- deepeval/metrics/bias/template.py +14 -2
- deepeval/metrics/contextual_precision/contextual_precision.py +56 -92
- deepeval/metrics/contextual_recall/contextual_recall.py +58 -85
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +53 -83
- deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
- deepeval/metrics/conversation_completeness/template.py +23 -3
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
- deepeval/metrics/conversational_dag/nodes.py +66 -123
- deepeval/metrics/conversational_dag/templates.py +16 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
- deepeval/metrics/dag/dag.py +10 -0
- deepeval/metrics/dag/nodes.py +63 -126
- deepeval/metrics/dag/templates.py +14 -0
- deepeval/metrics/exact_match/exact_match.py +9 -1
- deepeval/metrics/faithfulness/faithfulness.py +82 -136
- deepeval/metrics/g_eval/g_eval.py +93 -79
- deepeval/metrics/g_eval/template.py +18 -1
- deepeval/metrics/g_eval/utils.py +7 -6
- deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
- deepeval/metrics/goal_accuracy/template.py +21 -3
- deepeval/metrics/hallucination/hallucination.py +60 -75
- deepeval/metrics/hallucination/template.py +13 -0
- deepeval/metrics/indicator.py +11 -10
- deepeval/metrics/json_correctness/json_correctness.py +40 -38
- deepeval/metrics/json_correctness/template.py +10 -0
- deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
- deepeval/metrics/knowledge_retention/schema.py +9 -3
- deepeval/metrics/knowledge_retention/template.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +72 -43
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +93 -75
- deepeval/metrics/mcp/schema.py +4 -0
- deepeval/metrics/mcp/template.py +59 -0
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
- deepeval/metrics/mcp_use_metric/template.py +12 -0
- deepeval/metrics/misuse/misuse.py +77 -97
- deepeval/metrics/misuse/template.py +15 -0
- deepeval/metrics/multimodal_metrics/__init__.py +0 -1
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +37 -38
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +55 -76
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +37 -38
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +37 -38
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +57 -76
- deepeval/metrics/non_advice/non_advice.py +79 -105
- deepeval/metrics/non_advice/template.py +12 -0
- deepeval/metrics/pattern_match/pattern_match.py +12 -4
- deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
- deepeval/metrics/pii_leakage/template.py +14 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
- deepeval/metrics/plan_adherence/template.py +11 -0
- deepeval/metrics/plan_quality/plan_quality.py +63 -87
- deepeval/metrics/plan_quality/template.py +9 -0
- deepeval/metrics/prompt_alignment/prompt_alignment.py +78 -86
- deepeval/metrics/prompt_alignment/template.py +12 -0
- deepeval/metrics/role_adherence/role_adherence.py +48 -71
- deepeval/metrics/role_adherence/template.py +14 -0
- deepeval/metrics/role_violation/role_violation.py +75 -108
- deepeval/metrics/role_violation/template.py +12 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
- deepeval/metrics/step_efficiency/template.py +11 -0
- deepeval/metrics/summarization/summarization.py +115 -183
- deepeval/metrics/summarization/template.py +19 -0
- deepeval/metrics/task_completion/task_completion.py +67 -73
- deepeval/metrics/tool_correctness/tool_correctness.py +43 -42
- deepeval/metrics/tool_use/schema.py +4 -0
- deepeval/metrics/tool_use/template.py +16 -2
- deepeval/metrics/tool_use/tool_use.py +72 -94
- deepeval/metrics/topic_adherence/schema.py +4 -0
- deepeval/metrics/topic_adherence/template.py +21 -1
- deepeval/metrics/topic_adherence/topic_adherence.py +68 -81
- deepeval/metrics/toxicity/template.py +13 -0
- deepeval/metrics/toxicity/toxicity.py +80 -99
- deepeval/metrics/turn_contextual_precision/schema.py +3 -3
- deepeval/metrics/turn_contextual_precision/template.py +9 -2
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +154 -154
- deepeval/metrics/turn_contextual_recall/schema.py +3 -3
- deepeval/metrics/turn_contextual_recall/template.py +8 -1
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +148 -143
- deepeval/metrics/turn_contextual_relevancy/schema.py +2 -2
- deepeval/metrics/turn_contextual_relevancy/template.py +8 -1
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +154 -157
- deepeval/metrics/turn_faithfulness/schema.py +1 -1
- deepeval/metrics/turn_faithfulness/template.py +8 -1
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +180 -203
- deepeval/metrics/turn_relevancy/template.py +14 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
- deepeval/metrics/utils.py +161 -91
- deepeval/models/__init__.py +2 -0
- deepeval/models/base_model.py +44 -6
- deepeval/models/embedding_models/azure_embedding_model.py +34 -12
- deepeval/models/embedding_models/local_embedding_model.py +22 -7
- deepeval/models/embedding_models/ollama_embedding_model.py +17 -6
- deepeval/models/embedding_models/openai_embedding_model.py +3 -2
- deepeval/models/llms/__init__.py +2 -0
- deepeval/models/llms/amazon_bedrock_model.py +229 -73
- deepeval/models/llms/anthropic_model.py +143 -48
- deepeval/models/llms/azure_model.py +169 -95
- deepeval/models/llms/constants.py +2032 -0
- deepeval/models/llms/deepseek_model.py +82 -35
- deepeval/models/llms/gemini_model.py +126 -67
- deepeval/models/llms/grok_model.py +128 -65
- deepeval/models/llms/kimi_model.py +129 -87
- deepeval/models/llms/litellm_model.py +94 -18
- deepeval/models/llms/local_model.py +115 -16
- deepeval/models/llms/ollama_model.py +97 -76
- deepeval/models/llms/openai_model.py +169 -311
- deepeval/models/llms/portkey_model.py +58 -16
- deepeval/models/llms/utils.py +5 -2
- deepeval/models/retry_policy.py +10 -5
- deepeval/models/utils.py +56 -4
- deepeval/simulator/conversation_simulator.py +49 -2
- deepeval/simulator/template.py +16 -1
- deepeval/synthesizer/synthesizer.py +19 -17
- deepeval/test_case/api.py +24 -45
- deepeval/test_case/arena_test_case.py +7 -2
- deepeval/test_case/conversational_test_case.py +55 -6
- deepeval/test_case/llm_test_case.py +60 -6
- deepeval/test_run/api.py +3 -0
- deepeval/test_run/test_run.py +6 -1
- deepeval/utils.py +26 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/METADATA +3 -3
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/RECORD +145 -148
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -133
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/WHEEL +0 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/entry_points.txt +0 -0
deepeval/config/utils.py
CHANGED

@@ -1,7 +1,8 @@
 import json
 import os
 import re
-
+from dotenv import dotenv_values
+from pathlib import Path
 from typing import Any, Iterable, List, Optional
 
 
@@ -142,3 +143,10 @@ def dedupe_preserve_order(items: Iterable[str]) -> List[str]:
 def constrain_between(value: float, lo: float, hi: float) -> float:
     """Return value constrained to the inclusive range [lo, hi]."""
     return min(max(value, lo), hi)
+
+
+def read_dotenv_file(path: Path) -> dict[str, str]:
+    if not path.exists():
+        return {}
+    values = dotenv_values(path)
+    return {key: value for key, value in values.items() if value is not None}
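The new helper builds on dotenv_values, which maps keys declared without a value to None; the comprehension drops those entries, and a missing file short-circuits to an empty dict rather than raising. A minimal usage sketch (the .env contents shown are hypothetical):

from pathlib import Path

from deepeval.config.utils import read_dotenv_file

# Hypothetical .env file:
#   OPENAI_API_KEY=sk-test
#   BARE_FLAG            <- key with no value; dotenv_values returns None for it
env = read_dotenv_file(Path(".env"))
# -> {"OPENAI_API_KEY": "sk-test"}; BARE_FLAG is filtered out,
#    and a nonexistent path yields {} instead of an error.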
deepeval/dataset/api.py
CHANGED

@@ -1,4 +1,4 @@
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, model_validator
 from typing import Optional, List
 
 from deepeval.dataset.golden import Golden, ConversationalGolden
@@ -11,6 +11,17 @@ class APIDataset(BaseModel):
         None, alias="conversationalGoldens"
     )
 
+    @model_validator(mode="after")
+    def set_image_mappings_for_goldens(self):
+        if self.goldens:
+            for golden in self.goldens:
+                golden.images_mapping = golden._get_images_mapping()
+        if self.conversational_goldens:
+            for golden in self.conversational_goldens:
+                golden.images_mapping = golden._get_images_mapping()
+
+        return self
+
 
 class APIQueueDataset(BaseModel):
     alias: str
@@ -19,6 +30,17 @@ class APIQueueDataset(BaseModel):
         None, alias="conversationalGoldens"
    )
 
+    @model_validator(mode="after")
+    def set_image_mappings_for_goldens(self):
+        if self.goldens:
+            for golden in self.goldens:
+                golden.images_mapping = golden._get_images_mapping()
+        if self.conversational_goldens:
+            for golden in self.conversational_goldens:
+                golden.images_mapping = golden._get_images_mapping()
+
+        return self
+
 
 class DatasetHttpResponse(BaseModel):
     id: str
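Both payload models attach the same mode="after" validator, so every golden's images_mapping is resolved as soon as the payload is constructed or parsed, before anything is serialized to the API. A minimal sketch of the effect, assuming the placeholder ID has been registered in _MLLM_IMAGE_REGISTRY (the alias and ID are illustrative, and only the fields visible in this diff are passed; the models may require others):

from deepeval.dataset.api import APIQueueDataset
from deepeval.dataset.golden import Golden

payload = APIQueueDataset(
    alias="my-dataset",  # illustrative alias
    goldens=[Golden(input="Describe [DEEPEVAL:IMAGE:img-1]")],
)
# The after-validator has already run here: images_mapping is
# {"img-1": <MLLMImage>} if "img-1" is in _MLLM_IMAGE_REGISTRY, else None.
print(payload.goldens[0].images_mapping)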
deepeval/dataset/golden.py
CHANGED

@@ -1,6 +1,8 @@
+import re
 from pydantic import BaseModel, Field, PrivateAttr, model_validator
 from typing import Optional, Dict, List
 from deepeval.test_case import ToolCall, Turn, MLLMImage
+from deepeval.test_case.llm_test_case import _MLLM_IMAGE_REGISTRY
 
 
 class Golden(BaseModel):
@@ -33,6 +35,9 @@ class Golden(BaseModel):
         default=None, serialization_alias="customColumnKeyValues"
     )
     multimodal: bool = Field(False, exclude=True)
+    images_mapping: Dict[str, MLLMImage] = Field(
+        default=None, alias="imagesMapping"
+    )
     _dataset_rank: Optional[int] = PrivateAttr(default=None)
     _dataset_alias: Optional[str] = PrivateAttr(default=None)
     _dataset_id: Optional[str] = PrivateAttr(default=None)
@@ -45,27 +50,60 @@
             return self
 
         pattern = r"\[DEEPEVAL:IMAGE:(.*?)\]"
-        self.multimodal = (
+        auto_detect = (
             any(
                 [
-                    (
-                        re.search(pattern, self.input) is not None
-                        if self.input
-                        else False
-                    ),
-                    (
-                        re.search(pattern, self.actual_output) is not None
-                        if self.actual_output
-                        else False
-                    ),
+                    re.search(pattern, self.input or "") is not None,
+                    re.search(pattern, self.actual_output or "") is not None,
                 ]
             )
             if isinstance(self.input, str)
             else self.multimodal
        )
+        if self.retrieval_context is not None:
+            auto_detect = auto_detect or any(
+                re.search(pattern, context) is not None
+                for context in self.retrieval_context
+            )
+        if self.context is not None:
+            auto_detect = auto_detect or any(
+                re.search(pattern, context) is not None
+                for context in self.context
+            )
+
+        self.multimodal = auto_detect
 
         return self
 
+    def _get_images_mapping(self) -> Dict[str, MLLMImage]:
+        pattern = r"\[DEEPEVAL:IMAGE:(.*?)\]"
+        image_ids = set()
+
+        def extract_ids_from_string(s: Optional[str]) -> None:
+            """Helper to extract image IDs from a string."""
+            if s is not None and isinstance(s, str):
+                matches = re.findall(pattern, s)
+                image_ids.update(matches)
+
+        def extract_ids_from_list(lst: Optional[List[str]]) -> None:
+            """Helper to extract image IDs from a list of strings."""
+            if lst is not None:
+                for item in lst:
+                    extract_ids_from_string(item)
+
+        extract_ids_from_string(self.input)
+        extract_ids_from_string(self.actual_output)
+        extract_ids_from_string(self.expected_output)
+        extract_ids_from_list(self.context)
+        extract_ids_from_list(self.retrieval_context)
+
+        images_mapping = {}
+        for img_id in image_ids:
+            if img_id in _MLLM_IMAGE_REGISTRY:
+                images_mapping[img_id] = _MLLM_IMAGE_REGISTRY[img_id]
+
+        return images_mapping if len(images_mapping) > 0 else None
+
 
 class ConversationalGolden(BaseModel):
     scenario: str
@@ -86,6 +124,9 @@ class ConversationalGolden(BaseModel):
     )
     turns: Optional[List[Turn]] = Field(default=None)
     multimodal: bool = Field(False, exclude=True)
+    images_mapping: Dict[str, MLLMImage] = Field(
+        default=None, alias="imagesMapping"
+    )
     _dataset_rank: Optional[int] = PrivateAttr(default=None)
     _dataset_alias: Optional[str] = PrivateAttr(default=None)
     _dataset_id: Optional[str] = PrivateAttr(default=None)
@@ -98,15 +139,59 @@
             return self
 
         pattern = r"\[DEEPEVAL:IMAGE:(.*?)\]"
-        self.
-
-
-
-
-
-
-
-
-
+        if self.scenario:
+            if re.search(pattern, self.scenario) is not None:
+                self.multimodal = True
+                return self
+        if self.expected_outcome:
+            if re.search(pattern, self.expected_outcome) is not None:
+                self.multimodal = True
+                return self
+        if self.user_description:
+            if re.search(pattern, self.user_description) is not None:
+                self.multimodal = True
+                return self
+        if self.turns:
+            for turn in self.turns:
+                if re.search(pattern, turn.content) is not None:
+                    self.multimodal = True
+                    return self
+                if turn.retrieval_context is not None:
+                    self.multimodal = any(
+                        re.search(pattern, context) is not None
+                        for context in turn.retrieval_context
+                    )
 
         return self
+
+    def _get_images_mapping(self) -> Dict[str, MLLMImage]:
+        pattern = r"\[DEEPEVAL:IMAGE:(.*?)\]"
+        image_ids = set()
+
+        def extract_ids_from_string(s: Optional[str]) -> None:
+            """Helper to extract image IDs from a string."""
+            if s is not None and isinstance(s, str):
+                matches = re.findall(pattern, s)
+                image_ids.update(matches)
+
+        def extract_ids_from_list(lst: Optional[List[str]]) -> None:
+            """Helper to extract image IDs from a list of strings."""
+            if lst is not None:
+                for item in lst:
+                    extract_ids_from_string(item)
+
+        extract_ids_from_string(self.scenario)
+        extract_ids_from_string(self.expected_outcome)
+        extract_ids_from_list(self.context)
+        extract_ids_from_string(self.user_description)
+        if self.turns:
+            for turn in self.turns:
+                extract_ids_from_string(turn.content)
+                extract_ids_from_list(turn.retrieval_context)
+
+        images_mapping = {}
+        for img_id in image_ids:
+            if img_id in _MLLM_IMAGE_REGISTRY:
+                images_mapping[img_id] = _MLLM_IMAGE_REGISTRY[img_id]
+
+        return images_mapping if len(images_mapping) > 0 else None
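Both classes key multimodal auto-detection and ID harvesting off the same [DEEPEVAL:IMAGE:<id>] sentinel. A standalone sketch of that shared pattern logic, detached from the models (the sample strings are made up):

import re

pattern = r"\[DEEPEVAL:IMAGE:(.*?)\]"

fields = [
    "What is shown in [DEEPEVAL:IMAGE:img-1]?",     # e.g. input / scenario
    "It depicts [DEEPEVAL:IMAGE:img-2] at night.",  # e.g. actual_output
    "No placeholder here.",
]

# Auto-detection, as in the validators: any field containing the sentinel.
multimodal = any(re.search(pattern, f) is not None for f in fields)  # True

# ID harvesting, as in _get_images_mapping: collect every captured ID.
image_ids = {m for f in fields for m in re.findall(pattern, f)}
# -> {"img-1", "img-2"}; only IDs also present in _MLLM_IMAGE_REGISTRY
#    would end up in the returned mapping.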
deepeval/evaluate/evaluate.py
CHANGED

@@ -46,7 +46,6 @@ from deepeval.telemetry import capture_evaluation_run
 from deepeval.metrics import (
     BaseMetric,
     BaseConversationalMetric,
-    BaseMultimodalMetric,
 )
 from deepeval.metrics.indicator import (
     format_metric_description,
@@ -75,7 +74,6 @@ def assert_test(
         Union[
             List[BaseMetric],
             List[BaseConversationalMetric],
-            List[BaseMultimodalMetric],
         ]
     ] = None,
     golden: Optional[Golden] = None,
@@ -190,7 +188,6 @@ def evaluate(
         Union[
             List[BaseMetric],
             List[BaseConversationalMetric],
-            List[BaseMultimodalMetric],
         ]
     ] = None,
     # Evals on Confident AI
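Together with the deletion of the multimodal_g_eval package in the file list above, this narrows the metrics union accepted by assert_test and evaluate to single-turn and conversational metrics only. A sketch of a call that fits the narrowed signature, assuming the long-standing test_cases/metrics keywords (metric choice and threshold are illustrative):

from deepeval import evaluate
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase

evaluate(
    test_cases=[LLMTestCase(input="...", actual_output="...")],
    metrics=[AnswerRelevancyMetric(threshold=0.7)],  # a List[BaseMetric]
)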
|