deepeval 3.7.4-py3-none-any.whl → 3.7.6-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/config/settings.py +35 -1
- deepeval/dataset/api.py +23 -1
- deepeval/dataset/golden.py +139 -2
- deepeval/evaluate/evaluate.py +16 -11
- deepeval/evaluate/execute.py +13 -181
- deepeval/evaluate/utils.py +6 -26
- deepeval/integrations/pydantic_ai/agent.py +19 -2
- deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
- deepeval/key_handler.py +3 -0
- deepeval/metrics/__init__.py +14 -16
- deepeval/metrics/answer_relevancy/answer_relevancy.py +118 -116
- deepeval/metrics/answer_relevancy/template.py +22 -3
- deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
- deepeval/metrics/arena_g_eval/template.py +17 -1
- deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
- deepeval/metrics/argument_correctness/template.py +19 -2
- deepeval/metrics/base_metric.py +13 -44
- deepeval/metrics/bias/bias.py +102 -108
- deepeval/metrics/bias/template.py +14 -2
- deepeval/metrics/contextual_precision/contextual_precision.py +96 -94
- deepeval/metrics/contextual_precision/template.py +115 -66
- deepeval/metrics/contextual_recall/contextual_recall.py +94 -84
- deepeval/metrics/contextual_recall/template.py +106 -55
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +86 -84
- deepeval/metrics/contextual_relevancy/template.py +87 -58
- deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
- deepeval/metrics/conversation_completeness/template.py +23 -3
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
- deepeval/metrics/conversational_dag/nodes.py +66 -123
- deepeval/metrics/conversational_dag/templates.py +16 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
- deepeval/metrics/dag/dag.py +10 -0
- deepeval/metrics/dag/nodes.py +63 -126
- deepeval/metrics/dag/templates.py +16 -2
- deepeval/metrics/exact_match/exact_match.py +9 -1
- deepeval/metrics/faithfulness/faithfulness.py +138 -149
- deepeval/metrics/faithfulness/schema.py +1 -1
- deepeval/metrics/faithfulness/template.py +200 -115
- deepeval/metrics/g_eval/g_eval.py +87 -78
- deepeval/metrics/g_eval/template.py +18 -1
- deepeval/metrics/g_eval/utils.py +7 -6
- deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
- deepeval/metrics/goal_accuracy/template.py +21 -3
- deepeval/metrics/hallucination/hallucination.py +60 -75
- deepeval/metrics/hallucination/template.py +13 -0
- deepeval/metrics/indicator.py +7 -10
- deepeval/metrics/json_correctness/json_correctness.py +40 -38
- deepeval/metrics/json_correctness/template.py +10 -0
- deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
- deepeval/metrics/knowledge_retention/schema.py +9 -3
- deepeval/metrics/knowledge_retention/template.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +68 -38
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
- deepeval/metrics/mcp/template.py +52 -0
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
- deepeval/metrics/mcp_use_metric/template.py +12 -0
- deepeval/metrics/misuse/misuse.py +77 -97
- deepeval/metrics/misuse/template.py +15 -0
- deepeval/metrics/multimodal_metrics/__init__.py +0 -19
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +59 -53
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +79 -95
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +59 -53
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +59 -53
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +111 -109
- deepeval/metrics/non_advice/non_advice.py +79 -105
- deepeval/metrics/non_advice/template.py +12 -0
- deepeval/metrics/pattern_match/pattern_match.py +12 -4
- deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
- deepeval/metrics/pii_leakage/template.py +14 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
- deepeval/metrics/plan_adherence/template.py +11 -0
- deepeval/metrics/plan_quality/plan_quality.py +63 -87
- deepeval/metrics/plan_quality/template.py +9 -0
- deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
- deepeval/metrics/prompt_alignment/template.py +12 -0
- deepeval/metrics/ragas.py +3 -3
- deepeval/metrics/role_adherence/role_adherence.py +48 -71
- deepeval/metrics/role_adherence/template.py +14 -0
- deepeval/metrics/role_violation/role_violation.py +75 -108
- deepeval/metrics/role_violation/template.py +12 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
- deepeval/metrics/step_efficiency/template.py +11 -0
- deepeval/metrics/summarization/summarization.py +115 -183
- deepeval/metrics/summarization/template.py +19 -0
- deepeval/metrics/task_completion/task_completion.py +67 -73
- deepeval/metrics/tool_correctness/tool_correctness.py +45 -44
- deepeval/metrics/tool_use/tool_use.py +42 -66
- deepeval/metrics/topic_adherence/template.py +13 -0
- deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
- deepeval/metrics/toxicity/template.py +13 -0
- deepeval/metrics/toxicity/toxicity.py +80 -99
- deepeval/metrics/turn_contextual_precision/schema.py +21 -0
- deepeval/metrics/turn_contextual_precision/template.py +187 -0
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +592 -0
- deepeval/metrics/turn_contextual_recall/schema.py +21 -0
- deepeval/metrics/turn_contextual_recall/template.py +178 -0
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +563 -0
- deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
- deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +576 -0
- deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
- deepeval/metrics/turn_faithfulness/template.py +218 -0
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +627 -0
- deepeval/metrics/turn_relevancy/template.py +14 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
- deepeval/metrics/utils.py +158 -122
- deepeval/models/__init__.py +0 -12
- deepeval/models/base_model.py +49 -33
- deepeval/models/embedding_models/__init__.py +7 -0
- deepeval/models/embedding_models/azure_embedding_model.py +79 -33
- deepeval/models/embedding_models/local_embedding_model.py +39 -20
- deepeval/models/embedding_models/ollama_embedding_model.py +52 -19
- deepeval/models/embedding_models/openai_embedding_model.py +42 -22
- deepeval/models/llms/amazon_bedrock_model.py +226 -72
- deepeval/models/llms/anthropic_model.py +178 -63
- deepeval/models/llms/azure_model.py +218 -60
- deepeval/models/llms/constants.py +2032 -0
- deepeval/models/llms/deepseek_model.py +95 -40
- deepeval/models/llms/gemini_model.py +209 -64
- deepeval/models/llms/grok_model.py +139 -68
- deepeval/models/llms/kimi_model.py +140 -90
- deepeval/models/llms/litellm_model.py +131 -37
- deepeval/models/llms/local_model.py +125 -21
- deepeval/models/llms/ollama_model.py +147 -24
- deepeval/models/llms/openai_model.py +222 -269
- deepeval/models/llms/portkey_model.py +81 -22
- deepeval/models/llms/utils.py +8 -3
- deepeval/models/retry_policy.py +17 -14
- deepeval/models/utils.py +106 -5
- deepeval/optimizer/__init__.py +5 -0
- deepeval/optimizer/algorithms/__init__.py +6 -0
- deepeval/optimizer/algorithms/base.py +29 -0
- deepeval/optimizer/algorithms/configs.py +18 -0
- deepeval/optimizer/algorithms/copro/__init__.py +5 -0
- deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
- deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
- deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
- deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
- deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
- deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
- deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
- deepeval/optimizer/algorithms/simba/__init__.py +5 -0
- deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
- deepeval/{optimization → optimizer}/configs.py +5 -8
- deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
- deepeval/optimizer/prompt_optimizer.py +263 -0
- deepeval/optimizer/rewriter/__init__.py +5 -0
- deepeval/optimizer/rewriter/rewriter.py +124 -0
- deepeval/optimizer/rewriter/utils.py +214 -0
- deepeval/optimizer/scorer/__init__.py +5 -0
- deepeval/optimizer/scorer/base.py +86 -0
- deepeval/optimizer/scorer/scorer.py +316 -0
- deepeval/optimizer/scorer/utils.py +30 -0
- deepeval/optimizer/types.py +148 -0
- deepeval/{optimization → optimizer}/utils.py +47 -165
- deepeval/prompt/prompt.py +5 -9
- deepeval/simulator/conversation_simulator.py +43 -0
- deepeval/simulator/template.py +13 -0
- deepeval/test_case/__init__.py +1 -3
- deepeval/test_case/api.py +26 -45
- deepeval/test_case/arena_test_case.py +7 -2
- deepeval/test_case/conversational_test_case.py +68 -1
- deepeval/test_case/llm_test_case.py +206 -1
- deepeval/test_case/utils.py +4 -8
- deepeval/test_run/api.py +18 -14
- deepeval/test_run/test_run.py +3 -3
- deepeval/tracing/patchers.py +9 -4
- deepeval/tracing/tracing.py +2 -2
- deepeval/utils.py +65 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -4
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/RECORD +180 -193
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -148
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
- deepeval/models/mlllms/__init__.py +0 -4
- deepeval/models/mlllms/azure_model.py +0 -343
- deepeval/models/mlllms/gemini_model.py +0 -313
- deepeval/models/mlllms/ollama_model.py +0 -175
- deepeval/models/mlllms/openai_model.py +0 -309
- deepeval/optimization/__init__.py +0 -13
- deepeval/optimization/adapters/__init__.py +0 -2
- deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
- deepeval/optimization/aggregates.py +0 -14
- deepeval/optimization/copro/configs.py +0 -31
- deepeval/optimization/gepa/__init__.py +0 -7
- deepeval/optimization/gepa/configs.py +0 -115
- deepeval/optimization/miprov2/configs.py +0 -134
- deepeval/optimization/miprov2/loop.py +0 -785
- deepeval/optimization/mutations/__init__.py +0 -0
- deepeval/optimization/mutations/prompt_rewriter.py +0 -458
- deepeval/optimization/policies/__init__.py +0 -16
- deepeval/optimization/policies/tie_breaker.py +0 -67
- deepeval/optimization/prompt_optimizer.py +0 -462
- deepeval/optimization/simba/__init__.py +0 -0
- deepeval/optimization/simba/configs.py +0 -33
- deepeval/optimization/types.py +0 -361
- deepeval/test_case/mllm_test_case.py +0 -170
- /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
- /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
--- a/deepeval/metrics/conversation_completeness/conversation_completeness.py
+++ b/deepeval/metrics/conversation_completeness/conversation_completeness.py
@@ -8,9 +8,10 @@ from deepeval.metrics.conversation_completeness.template import (
 from deepeval.metrics.utils import (
     check_conversational_test_case_params,
     construct_verbose_logs,
-    trimAndLoadJson,
     initialize_model,
     convert_turn_to_dict,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.indicator import metric_progress_indicator
@@ -18,7 +19,11 @@ from deepeval.test_case import ConversationalTestCase
 from deepeval.test_case import TurnParams
 from deepeval.test_case.conversational_test_case import Turn
 from deepeval.utils import get_or_create_event_loop, prettify_list
-from deepeval.metrics.conversation_completeness.schema import
+from deepeval.metrics.conversation_completeness.schema import (
+    UserIntentions,
+    ConversationCompletenessVerdict,
+    ConversationCompletenessScoreReason,
+)
 from deepeval.metrics.api import metric_data_manager
 
 
@@ -51,8 +56,15 @@ class ConversationCompletenessMetric(BaseConversationalMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ):
+
+        multimodal = test_case.multimodal
         check_conversational_test_case_params(
-            test_case,
+            test_case,
+            self._required_test_case_params,
+            self,
+            False,
+            self.model,
+            multimodal,
         )
 
         self.evaluation_cost = 0 if self.using_native_model else None
@@ -71,17 +83,19 @@ class ConversationCompletenessMetric(BaseConversationalMetric):
                 )
             else:
                 self.user_intentions = self._extract_user_intentions(
-                    test_case.turns
+                    test_case.turns, multimodal=multimodal
                 )
                 self.verdicts = [
                     self._generate_verdict(
-                        turns=test_case.turns,
+                        turns=test_case.turns,
+                        intention=user_intention,
+                        multimodal=multimodal,
                     )
                     for user_intention in self.user_intentions
                 ]
 
                 self.score = self._calculate_score()
-                self.reason = self._generate_reason()
+                self.reason = self._generate_reason(multimodal=multimodal)
                 self.success = self.score >= self.threshold
                 self.verbose_logs = construct_verbose_logs(
                     self,
@@ -105,28 +119,40 @@ class ConversationCompletenessMetric(BaseConversationalMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
+
+        multimodal = test_case.multimodal
         check_conversational_test_case_params(
-            test_case,
+            test_case,
+            self._required_test_case_params,
+            self,
+            False,
+            self.model,
+            multimodal,
         )
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
-            self,
+            self,
+            async_mode=True,
+            _show_indicator=_show_indicator,
+            _in_component=_in_component,
         ):
             self.user_intentions = await self._a_extract_user_intentions(
-                test_case.turns
+                test_case.turns, multimodal=multimodal
             )
             self.verdicts = await asyncio.gather(
                 *[
                     self._a_generate_verdict(
-                        turns=test_case.turns,
+                        turns=test_case.turns,
+                        intention=user_intention,
+                        multimodal=multimodal,
                     )
                     for user_intention in self.user_intentions
                 ]
             )
 
             self.score = self._calculate_score()
-            self.reason = await self._a_generate_reason()
+            self.reason = await self._a_generate_reason(multimodal=multimodal)
             self.success = self.score >= self.threshold
             self.verbose_logs = construct_verbose_logs(
                 self,
@@ -143,7 +169,7 @@ class ConversationCompletenessMetric(BaseConversationalMetric):
         )
         return self.score
 
-    async def _a_generate_reason(self) -> str:
+    async def _a_generate_reason(self, multimodal: bool) -> str:
         incompletenesses: List[str] = []
         for verdict in self.verdicts:
             if verdict.verdict.strip().lower() == "no":
@@ -153,27 +179,17 @@ class ConversationCompletenessMetric(BaseConversationalMetric):
             score=self.score,
             incompletenesses=incompletenesses,
             intentions=self.user_intentions,
+            multimodal=multimodal,
+        )
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ConversationCompletenessScoreReason,
+            extract_schema=lambda score_reason: score_reason.reason,
+            extract_json=lambda data: data["reason"],
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(
-                prompt, schema=ConversationCompletenessScoreReason
-            )
-            self.evaluation_cost += cost
-            return res.reason
-        else:
-            try:
-                res: ConversationCompletenessScoreReason = (
-                    await self.model.a_generate(
-                        prompt, schema=ConversationCompletenessScoreReason
-                    )
-                )
-                return res.reason
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
 
-    def _generate_reason(self) -> str:
+    def _generate_reason(self, multimodal: bool) -> str:
         if self.include_reason is False:
             return None
 
@@ -186,113 +202,79 @@ class ConversationCompletenessMetric(BaseConversationalMetric):
             score=self.score,
             incompletenesses=incompletenesses,
             intentions=self.user_intentions,
+            multimodal=multimodal,
+        )
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ConversationCompletenessScoreReason,
+            extract_schema=lambda score_reason: score_reason.reason,
+            extract_json=lambda data: data["reason"],
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(
-                prompt, schema=ConversationCompletenessScoreReason
-            )
-            self.evaluation_cost += cost
-            return res.reason
-        else:
-            try:
-                res: ConversationCompletenessScoreReason = self.model.generate(
-                    prompt, schema=ConversationCompletenessScoreReason
-                )
-                return res.reason
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
 
     async def _a_generate_verdict(
-        self, turns: List[Turn], intention: str
+        self, turns: List[Turn], intention: str, multimodal: bool
     ) -> ConversationCompletenessVerdict:
         prompt = ConversationCompletenessTemplate.generate_verdicts(
             turns=[convert_turn_to_dict(turn) for turn in turns],
             intention=intention,
+            multimodal=multimodal,
+        )
+
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ConversationCompletenessVerdict,
+            extract_schema=lambda r: r,
+            extract_json=lambda data: ConversationCompletenessVerdict(**data),
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(
-                prompt, schema=ConversationCompletenessVerdict
-            )
-            self.evaluation_cost += cost
-            return res
-        else:
-            try:
-                res: ConversationCompletenessVerdict = (
-                    await self.model.a_generate(
-                        prompt, schema=ConversationCompletenessVerdict
-                    )
-                )
-                return res
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return ConversationCompletenessVerdict(**data)
 
     def _generate_verdict(
-        self, turns: List[Turn], intention: str
+        self, turns: List[Turn], intention: str, multimodal: bool
    ) -> ConversationCompletenessVerdict:
         prompt = ConversationCompletenessTemplate.generate_verdicts(
             turns=[convert_turn_to_dict(turn) for turn in turns],
             intention=intention,
+            multimodal=multimodal,
+        )
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ConversationCompletenessVerdict,
+            extract_schema=lambda r: r,
+            extract_json=lambda data: ConversationCompletenessVerdict(**data),
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(
-                prompt, schema=ConversationCompletenessVerdict
-            )
-            self.evaluation_cost += cost
-            return res
-        else:
-            try:
-                res: ConversationCompletenessVerdict = self.model.generate(
-                    prompt, schema=ConversationCompletenessVerdict
-                )
-                return res
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return ConversationCompletenessVerdict(**data)
 
-    async def _a_extract_user_intentions(
+    async def _a_extract_user_intentions(
+        self, turns: List[Turn], multimodal: bool
+    ) -> List[str]:
         prompt = ConversationCompletenessTemplate.extract_user_intentions(
-            turns=[convert_turn_to_dict(turn) for turn in turns]
+            turns=[convert_turn_to_dict(turn) for turn in turns],
+            multimodal=multimodal,
+        )
+
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=UserIntentions,
+            extract_schema=lambda r: r.intentions,
+            extract_json=lambda data: UserIntentions(**data).intentions,
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(
-                prompt, schema=UserIntentions
-            )
-            self.evaluation_cost += cost
-            return res.intentions
-        else:
-            try:
-                res: UserIntentions = await self.model.a_generate(
-                    prompt, schema=UserIntentions
-                )
-                return res.intentions
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return UserIntentions(**data).intentions
 
-    def _extract_user_intentions(
+    def _extract_user_intentions(
+        self, turns: List[Turn], multimodal: bool
+    ) -> List[str]:
         prompt = ConversationCompletenessTemplate.extract_user_intentions(
-            turns=[convert_turn_to_dict(turn) for turn in turns]
+            turns=[convert_turn_to_dict(turn) for turn in turns],
+            multimodal=multimodal,
+        )
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=UserIntentions,
+            extract_schema=lambda r: r.intentions,
+            extract_json=lambda data: UserIntentions(**data).intentions,
        )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=UserIntentions)
-            self.evaluation_cost += cost
-            return res.intentions
-        else:
-            try:
-                res: UserIntentions = self.model.generate(
-                    prompt, schema=UserIntentions
-                )
-                return res.intentions
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return UserIntentions(**data).intentions
 
     def _calculate_score(self) -> float:
         number_of_verdicts = len(self.verdicts)
@@ -312,8 +294,8 @@ class ConversationCompletenessMetric(BaseConversationalMetric):
             self.success = False
         else:
             try:
-                self.score >= self.threshold
-            except:
+                self.success = self.score >= self.threshold
+            except TypeError:
                 self.success = False
         return self.success
 
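This same consolidation repeats across nearly every metric file touched by this release: the duplicated `if self.using_native_model: ... else: try/except TypeError` blocks are deleted, and each call site delegates to `generate_with_schema_and_extract` / `a_generate_with_schema_and_extract` imported from `deepeval.metrics.utils`. The helpers' bodies are not part of this excerpt, but the removed code and the new call sites pin down the behavior. A minimal sketch of the synchronous helper, assuming it simply centralizes the old per-metric logic (only the keyword signature is confirmed by this diff):

```python
# Hypothetical reconstruction of generate_with_schema_and_extract, inferred
# from the per-metric blocks it replaces above. Not the packaged source.
from typing import Any, Callable, Type

from pydantic import BaseModel

from deepeval.metrics.utils import trimAndLoadJson  # existing deepeval helper


def generate_with_schema_and_extract(
    metric: Any,  # metric carrying .model, .using_native_model, .evaluation_cost
    prompt: str,
    schema_cls: Type[BaseModel],
    extract_schema: Callable[[Any], Any],  # field access on a parsed schema object
    extract_json: Callable[[dict], Any],  # same extraction from fallback JSON
) -> Any:
    if metric.using_native_model:
        # Native models return (result, cost); accumulate evaluation cost.
        res, cost = metric.model.generate(prompt, schema=schema_cls)
        metric.evaluation_cost += cost
        return extract_schema(res)
    try:
        # Custom models that accept a schema return a parsed pydantic object.
        res = metric.model.generate(prompt, schema=schema_cls)
        return extract_schema(res)
    except TypeError:
        # Models without schema support: free-form generation, then trim
        # and parse the JSON out of the raw response.
        res = metric.model.generate(prompt)
        data = trimAndLoadJson(res, metric)
        return extract_json(data)
```

The async twin presumably mirrors this with `await metric.model.a_generate(...)`, which is exactly the shape of the deleted `_a_*` blocks.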
--- a/deepeval/metrics/conversation_completeness/template.py
+++ b/deepeval/metrics/conversation_completeness/template.py
@@ -2,11 +2,21 @@ from typing import List, Dict
 
 
 class ConversationCompletenessTemplate:
+    multimodal_rules = """
+--- MULTIMODAL INPUT RULES ---
+- Treat image content as factual evidence.
+- Only reference visual details that are explicitly and clearly visible.
+- Do not infer or guess objects, text, or details not visibly present.
+- If an image is unclear or ambiguous, mark uncertainty explicitly.
+"""
+
     @staticmethod
-    def extract_user_intentions(turns: List[Dict]):
+    def extract_user_intentions(turns: List[Dict], multimodal: bool = False):
         return f"""Based on the given list of message exchanges between a user and an LLM, generate a JSON object to extract all user intentions in the conversation. The JSON will have 1 field: 'intentions'.
 You should ONLY consider the overall intention, and not dwell too much on the specifics, as we are more concerned about the overall objective of the conversation.
 
+{ConversationCompletenessTemplate.multimodal_rules if multimodal else ""}
+
 **
 IMPORTANT: Please make sure to only return in JSON format.
 Example Turns:
@@ -49,8 +59,13 @@ JSON:
 """
 
     @staticmethod
-    def generate_verdicts(
+    def generate_verdicts(
+        turns: List[Dict], intention: str, multimodal: bool = False
+    ):
         return f"""Based on the given list of message exchanges between a user and an LLM, generate a JSON object to indicate whether given user intention was satisfied from the conversation messages. The JSON will have 2 fields: 'verdict' and 'reason'.
+
+{ConversationCompletenessTemplate.multimodal_rules if multimodal else ""}
+
 The 'verdict' key should STRICTLY be either 'yes' or 'no', which states whether the user intention was satisfied or not.
 Provide a 'reason' ONLY if the answer is 'no'.
 You MUST USE look at all messages provided in the list of messages to make an informed judgement on satisfaction.
@@ -106,8 +121,13 @@ JSON:
 """
 
     @staticmethod
-    def generate_reason(
+    def generate_reason(
+        score, incompletenesses, intentions, multimodal: bool = False
+    ):
         return f"""Below is a list of incompletenesses drawn from some messages in a conversation, which you have minimal knowledge of. It is a list of strings explaining why an LLM 'actual_output' is incomplete to satisfy the user `input` for a particular message.
+
+{ConversationCompletenessTemplate.multimodal_rules if multimodal else ""}
+
 Given the completeness score, which is a 0-1 score indicating how incomplete the OVERALL `actual_output`s are to the user intentions found in the `input`s of a conversation (higher the better), CONCISELY summarize the incompletenesses to justify the score.
 
 **
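The template changes follow one pattern across the release: a class-level `multimodal_rules` block plus a `multimodal: bool = False` parameter on each prompt builder, spliced into the prompt via a conditional f-string expression. Based on the signatures shown above, toggling it looks like this (a usage sketch; the `turns` payload is illustrative):

```python
from deepeval.metrics.conversation_completeness.template import (
    ConversationCompletenessTemplate,
)

# Illustrative turn dicts; in the metric itself these come from
# convert_turn_to_dict().
turns = [
    {"role": "user", "content": "Describe the attached chart."},
    {"role": "assistant", "content": "It shows monthly revenue."},
]

text_prompt = ConversationCompletenessTemplate.extract_user_intentions(turns)
mm_prompt = ConversationCompletenessTemplate.extract_user_intentions(
    turns, multimodal=True
)

# The rules block is interpolated only when multimodal=True.
assert "MULTIMODAL INPUT RULES" not in text_prompt
assert "MULTIMODAL INPUT RULES" in mm_prompt
```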
--- a/deepeval/metrics/conversational_dag/conversational_dag.py
+++ b/deepeval/metrics/conversational_dag/conversational_dag.py
@@ -1,4 +1,4 @@
-from typing import Optional, Union
+from typing import List, Optional, Union
 from deepeval.metrics import BaseConversationalMetric
 from deepeval.test_case import (
     ConversationalTestCase,
@@ -11,7 +11,6 @@ from deepeval.metrics.utils import (
 )
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.indicator import metric_progress_indicator
-from deepeval.metrics.g_eval.schema import *
 from deepeval.metrics import DeepAcyclicGraph
 from deepeval.metrics.dag.utils import (
     is_valid_dag_from_roots,
@@ -35,11 +34,8 @@ class ConversationalDAGMetric(BaseConversationalMetric):
         verbose_mode: bool = False,
         _include_dag_suffix: bool = True,
     ):
-        if (
-            is_valid_dag_from_roots(
-                root_nodes=dag.root_nodes, multiturn=dag.multiturn
-            )
-            == False
+        if not is_valid_dag_from_roots(
+            root_nodes=dag.root_nodes, multiturn=dag.multiturn
         ):
             raise ValueError("Cycle detected in DAG graph.")
 
@@ -62,10 +58,14 @@ class ConversationalDAGMetric(BaseConversationalMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
+        multimodal = test_case.multimodal
         check_conversational_test_case_params(
             test_case,
             extract_required_params(self.dag.root_nodes, multiturn=True),
             self,
+            False,
+            self.model,
+            multimodal,
         )
 
         self.evaluation_cost = 0 if self.using_native_model else None
@@ -105,10 +105,14 @@ class ConversationalDAGMetric(BaseConversationalMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
+        multimodal = test_case.multimodal
         check_conversational_test_case_params(
             test_case,
             extract_required_params(self.dag.root_nodes, multiturn=True),
             self,
+            False,
+            self.model,
+            multimodal,
         )
 
         self.evaluation_cost = 0 if self.using_native_model else None
@@ -139,7 +143,7 @@ class ConversationalDAGMetric(BaseConversationalMetric):
         else:
             try:
                 self.success = self.score >= self.threshold
-            except:
+            except TypeError:
                 self.success = False
         return self.success
 
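Note that `is_successful` gets the same two fixes here as in `conversation_completeness.py`: the comparison result is actually assigned (the old body computed `self.score >= self.threshold` and discarded it, a no-op), and the bare `except:` is narrowed to the `TypeError` raised when `self.score` is `None`. Reduced to a free function, the corrected logic is simply:

```python
from typing import Optional


def is_successful(score: Optional[float], threshold: float) -> bool:
    # 3.7.6 behavior: return the comparison, and map only a None score
    # (TypeError on `>=`) to failure, instead of a bare `except:` that
    # swallowed every exception type.
    try:
        return score >= threshold
    except TypeError:
        return False


assert is_successful(0.9, 0.5) is True
assert is_successful(None, 0.5) is False
```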