deepeval 3.7.4__py3-none-any.whl → 3.7.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/config/settings.py +35 -1
- deepeval/dataset/api.py +23 -1
- deepeval/dataset/golden.py +139 -2
- deepeval/evaluate/evaluate.py +16 -11
- deepeval/evaluate/execute.py +13 -181
- deepeval/evaluate/utils.py +6 -26
- deepeval/integrations/pydantic_ai/agent.py +19 -2
- deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
- deepeval/key_handler.py +3 -0
- deepeval/metrics/__init__.py +14 -16
- deepeval/metrics/answer_relevancy/answer_relevancy.py +118 -116
- deepeval/metrics/answer_relevancy/template.py +22 -3
- deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
- deepeval/metrics/arena_g_eval/template.py +17 -1
- deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
- deepeval/metrics/argument_correctness/template.py +19 -2
- deepeval/metrics/base_metric.py +13 -44
- deepeval/metrics/bias/bias.py +102 -108
- deepeval/metrics/bias/template.py +14 -2
- deepeval/metrics/contextual_precision/contextual_precision.py +96 -94
- deepeval/metrics/contextual_precision/template.py +115 -66
- deepeval/metrics/contextual_recall/contextual_recall.py +94 -84
- deepeval/metrics/contextual_recall/template.py +106 -55
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +86 -84
- deepeval/metrics/contextual_relevancy/template.py +87 -58
- deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
- deepeval/metrics/conversation_completeness/template.py +23 -3
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
- deepeval/metrics/conversational_dag/nodes.py +66 -123
- deepeval/metrics/conversational_dag/templates.py +16 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
- deepeval/metrics/dag/dag.py +10 -0
- deepeval/metrics/dag/nodes.py +63 -126
- deepeval/metrics/dag/templates.py +16 -2
- deepeval/metrics/exact_match/exact_match.py +9 -1
- deepeval/metrics/faithfulness/faithfulness.py +138 -149
- deepeval/metrics/faithfulness/schema.py +1 -1
- deepeval/metrics/faithfulness/template.py +200 -115
- deepeval/metrics/g_eval/g_eval.py +87 -78
- deepeval/metrics/g_eval/template.py +18 -1
- deepeval/metrics/g_eval/utils.py +7 -6
- deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
- deepeval/metrics/goal_accuracy/template.py +21 -3
- deepeval/metrics/hallucination/hallucination.py +60 -75
- deepeval/metrics/hallucination/template.py +13 -0
- deepeval/metrics/indicator.py +7 -10
- deepeval/metrics/json_correctness/json_correctness.py +40 -38
- deepeval/metrics/json_correctness/template.py +10 -0
- deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
- deepeval/metrics/knowledge_retention/schema.py +9 -3
- deepeval/metrics/knowledge_retention/template.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +68 -38
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
- deepeval/metrics/mcp/template.py +52 -0
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
- deepeval/metrics/mcp_use_metric/template.py +12 -0
- deepeval/metrics/misuse/misuse.py +77 -97
- deepeval/metrics/misuse/template.py +15 -0
- deepeval/metrics/multimodal_metrics/__init__.py +0 -19
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +59 -53
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +79 -95
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +59 -53
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +59 -53
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +111 -109
- deepeval/metrics/non_advice/non_advice.py +79 -105
- deepeval/metrics/non_advice/template.py +12 -0
- deepeval/metrics/pattern_match/pattern_match.py +12 -4
- deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
- deepeval/metrics/pii_leakage/template.py +14 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
- deepeval/metrics/plan_adherence/template.py +11 -0
- deepeval/metrics/plan_quality/plan_quality.py +63 -87
- deepeval/metrics/plan_quality/template.py +9 -0
- deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
- deepeval/metrics/prompt_alignment/template.py +12 -0
- deepeval/metrics/ragas.py +3 -3
- deepeval/metrics/role_adherence/role_adherence.py +48 -71
- deepeval/metrics/role_adherence/template.py +14 -0
- deepeval/metrics/role_violation/role_violation.py +75 -108
- deepeval/metrics/role_violation/template.py +12 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
- deepeval/metrics/step_efficiency/template.py +11 -0
- deepeval/metrics/summarization/summarization.py +115 -183
- deepeval/metrics/summarization/template.py +19 -0
- deepeval/metrics/task_completion/task_completion.py +67 -73
- deepeval/metrics/tool_correctness/tool_correctness.py +45 -44
- deepeval/metrics/tool_use/tool_use.py +42 -66
- deepeval/metrics/topic_adherence/template.py +13 -0
- deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
- deepeval/metrics/toxicity/template.py +13 -0
- deepeval/metrics/toxicity/toxicity.py +80 -99
- deepeval/metrics/turn_contextual_precision/schema.py +21 -0
- deepeval/metrics/turn_contextual_precision/template.py +187 -0
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +592 -0
- deepeval/metrics/turn_contextual_recall/schema.py +21 -0
- deepeval/metrics/turn_contextual_recall/template.py +178 -0
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +563 -0
- deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
- deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +576 -0
- deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
- deepeval/metrics/turn_faithfulness/template.py +218 -0
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +627 -0
- deepeval/metrics/turn_relevancy/template.py +14 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
- deepeval/metrics/utils.py +158 -122
- deepeval/models/__init__.py +0 -12
- deepeval/models/base_model.py +49 -33
- deepeval/models/embedding_models/__init__.py +7 -0
- deepeval/models/embedding_models/azure_embedding_model.py +79 -33
- deepeval/models/embedding_models/local_embedding_model.py +39 -20
- deepeval/models/embedding_models/ollama_embedding_model.py +52 -19
- deepeval/models/embedding_models/openai_embedding_model.py +42 -22
- deepeval/models/llms/amazon_bedrock_model.py +226 -72
- deepeval/models/llms/anthropic_model.py +178 -63
- deepeval/models/llms/azure_model.py +218 -60
- deepeval/models/llms/constants.py +2032 -0
- deepeval/models/llms/deepseek_model.py +95 -40
- deepeval/models/llms/gemini_model.py +209 -64
- deepeval/models/llms/grok_model.py +139 -68
- deepeval/models/llms/kimi_model.py +140 -90
- deepeval/models/llms/litellm_model.py +131 -37
- deepeval/models/llms/local_model.py +125 -21
- deepeval/models/llms/ollama_model.py +147 -24
- deepeval/models/llms/openai_model.py +222 -269
- deepeval/models/llms/portkey_model.py +81 -22
- deepeval/models/llms/utils.py +8 -3
- deepeval/models/retry_policy.py +17 -14
- deepeval/models/utils.py +106 -5
- deepeval/optimizer/__init__.py +5 -0
- deepeval/optimizer/algorithms/__init__.py +6 -0
- deepeval/optimizer/algorithms/base.py +29 -0
- deepeval/optimizer/algorithms/configs.py +18 -0
- deepeval/optimizer/algorithms/copro/__init__.py +5 -0
- deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
- deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
- deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
- deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
- deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
- deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
- deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
- deepeval/optimizer/algorithms/simba/__init__.py +5 -0
- deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
- deepeval/{optimization → optimizer}/configs.py +5 -8
- deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
- deepeval/optimizer/prompt_optimizer.py +263 -0
- deepeval/optimizer/rewriter/__init__.py +5 -0
- deepeval/optimizer/rewriter/rewriter.py +124 -0
- deepeval/optimizer/rewriter/utils.py +214 -0
- deepeval/optimizer/scorer/__init__.py +5 -0
- deepeval/optimizer/scorer/base.py +86 -0
- deepeval/optimizer/scorer/scorer.py +316 -0
- deepeval/optimizer/scorer/utils.py +30 -0
- deepeval/optimizer/types.py +148 -0
- deepeval/{optimization → optimizer}/utils.py +47 -165
- deepeval/prompt/prompt.py +5 -9
- deepeval/simulator/conversation_simulator.py +43 -0
- deepeval/simulator/template.py +13 -0
- deepeval/test_case/__init__.py +1 -3
- deepeval/test_case/api.py +26 -45
- deepeval/test_case/arena_test_case.py +7 -2
- deepeval/test_case/conversational_test_case.py +68 -1
- deepeval/test_case/llm_test_case.py +206 -1
- deepeval/test_case/utils.py +4 -8
- deepeval/test_run/api.py +18 -14
- deepeval/test_run/test_run.py +3 -3
- deepeval/tracing/patchers.py +9 -4
- deepeval/tracing/tracing.py +2 -2
- deepeval/utils.py +65 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -4
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/RECORD +180 -193
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -148
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
- deepeval/models/mlllms/__init__.py +0 -4
- deepeval/models/mlllms/azure_model.py +0 -343
- deepeval/models/mlllms/gemini_model.py +0 -313
- deepeval/models/mlllms/ollama_model.py +0 -175
- deepeval/models/mlllms/openai_model.py +0 -309
- deepeval/optimization/__init__.py +0 -13
- deepeval/optimization/adapters/__init__.py +0 -2
- deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
- deepeval/optimization/aggregates.py +0 -14
- deepeval/optimization/copro/configs.py +0 -31
- deepeval/optimization/gepa/__init__.py +0 -7
- deepeval/optimization/gepa/configs.py +0 -115
- deepeval/optimization/miprov2/configs.py +0 -134
- deepeval/optimization/miprov2/loop.py +0 -785
- deepeval/optimization/mutations/__init__.py +0 -0
- deepeval/optimization/mutations/prompt_rewriter.py +0 -458
- deepeval/optimization/policies/__init__.py +0 -16
- deepeval/optimization/policies/tie_breaker.py +0 -67
- deepeval/optimization/prompt_optimizer.py +0 -462
- deepeval/optimization/simba/__init__.py +0 -0
- deepeval/optimization/simba/configs.py +0 -33
- deepeval/optimization/types.py +0 -361
- deepeval/test_case/mllm_test_case.py +0 -170
- /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
- /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
deepeval/metrics/prompt_alignment/prompt_alignment.py
CHANGED

@@ -5,9 +5,10 @@ from typing import Optional, List, Union
 from deepeval.utils import get_or_create_event_loop, prettify_list
 from deepeval.metrics.utils import (
     construct_verbose_logs,
-    trimAndLoadJson,
     check_llm_test_case_params,
     initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.test_case import (
     LLMTestCase,

@@ -60,7 +61,15 @@ class PromptAlignmentMetric(BaseMetric):
         _log_metric_to_confident: bool = True,
     ) -> float:

-        check_llm_test_case_params(
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            test_case.multimodal,
+        )

         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(

@@ -72,6 +81,7 @@ class PromptAlignmentMetric(BaseMetric):
                     test_case,
                     _show_indicator=False,
                     _in_component=_in_component,
+                    _log_metric_to_confident=_log_metric_to_confident,
                 )
                 loop.run_until_complete(
                     asyncio.wait_for(

@@ -80,8 +90,10 @@ class PromptAlignmentMetric(BaseMetric):
                     )
                 )
             else:
-                self.verdicts: paschema.
-
+                self.verdicts: List[paschema.PromptAlignmentVerdict] = (
+                    self._generate_verdicts(
+                        test_case.input, test_case.actual_output
+                    )
                 )
                 self.score = self._calculate_score()
                 self.reason = self._generate_reason(

@@ -111,7 +123,15 @@ class PromptAlignmentMetric(BaseMetric):
         _log_metric_to_confident: bool = True,
     ) -> float:

-        check_llm_test_case_params(
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            test_case.multimodal,
+        )

         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(

@@ -120,8 +140,10 @@ class PromptAlignmentMetric(BaseMetric):
             _show_indicator=_show_indicator,
             _in_component=_in_component,
         ):
-            self.verdicts: paschema.
-
+            self.verdicts: List[paschema.PromptAlignmentVerdict] = (
+                await self._a_generate_verdicts(
+                    test_case.input, test_case.actual_output
+                )
             )
             self.score = self._calculate_score()
             self.reason = await self._a_generate_reason(

@@ -142,7 +164,9 @@ class PromptAlignmentMetric(BaseMetric):
             )
         return self.score

-    async def _a_generate_reason(
+    async def _a_generate_reason(
+        self, input: str, actual_output: str
+    ) -> Optional[str]:
         if self.include_reason is False:
             return None

@@ -157,27 +181,16 @@ class PromptAlignmentMetric(BaseMetric):
             actual_output=actual_output,
             score=format(self.score, ".2f"),
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(
-                prompt, schema=paschema.PromptAlignmentScoreReason
-            )
-            self.evaluation_cost += cost
-            return res.reason
-        else:
-            try:
-                res: paschema.PromptAlignmentScoreReason = (
-                    await self.model.a_generate(
-                        prompt=prompt,
-                        schema=paschema.PromptAlignmentScoreReason,
-                    )
-                )
-                return res.reason
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]

-
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=paschema.PromptAlignmentScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
+
+    def _generate_reason(self, input: str, actual_output: str) -> Optional[str]:
         if self.include_reason is False:
             return None

@@ -192,78 +205,54 @@ class PromptAlignmentMetric(BaseMetric):
             actual_output=actual_output,
             score=format(self.score, ".2f"),
         )
-
-
-
-
-
-
-
-
-            res: paschema.PromptAlignmentScoreReason = self.model.generate(
-                prompt=prompt, schema=paschema.PromptAlignmentScoreReason
-            )
-            return res.reason
-        except TypeError:
-            res = self.model.generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return data["reason"]
+
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=paschema.PromptAlignmentScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )

     async def _a_generate_verdicts(
         self, input: str, actual_output: str
-    ) -> paschema.
+    ) -> List[paschema.PromptAlignmentVerdict]:
         prompt = PromptAlignmentTemplate.generate_verdicts(
             prompt_instructions=self.prompt_instructions,
             input=input,
             actual_output=actual_output,
         )
-
-
-
-
-
-
-
-
-
-
-                )
-            return [item for item in res.verdicts]
-        except TypeError:
-            res = await self.model.a_generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return [
-                paschema.PromptAlignmentVerdict(**item)
-                for item in data["verdicts"]
-            ]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=paschema.Verdicts,
+            extract_schema=lambda s: list(s.verdicts),
+            extract_json=lambda data: [
+                paschema.PromptAlignmentVerdict(**item)
+                for item in data["verdicts"]
+            ],
+        )

     def _generate_verdicts(
         self, input: str, actual_output: str
-    ) -> paschema.
+    ) -> List[paschema.PromptAlignmentVerdict]:
         prompt = PromptAlignmentTemplate.generate_verdicts(
             prompt_instructions=self.prompt_instructions,
             input=input,
             actual_output=actual_output,
         )
-
-
-
-
-
-
-
-
-
-
-        except TypeError:
-            res = self.model.generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return [
-                paschema.PromptAlignmentVerdict(**item)
-                for item in data["verdicts"]
-            ]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=paschema.Verdicts,
+            extract_schema=lambda s: list(s.verdicts),
+            extract_json=lambda data: [
+                paschema.PromptAlignmentVerdict(**item)
+                for item in data["verdicts"]
+            ],
+        )

-    def _calculate_score(self):
+    def _calculate_score(self) -> float:
         number_of_verdicts = len(self.verdicts)
         if number_of_verdicts == 0:
             return 1
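Note on the recurring refactor in the hunks above: 3.7.6 drops each metric's inline `if self.using_native_model / try ... except TypeError` generation blocks in favour of two shared helpers imported from deepeval.metrics.utils, generate_with_schema_and_extract and a_generate_with_schema_and_extract. The helpers' bodies are not part of this diff; the sketch below is only an inference from the inline code that was removed (native models return a (result, cost) tuple, custom models may or may not accept a schema argument), not the implementation actually shipped in the package.

    # Hypothetical sketch of the sync helper, reconstructed from the removed inline code.
    from typing import Any, Callable, Type

    from pydantic import BaseModel

    from deepeval.metrics.utils import trimAndLoadJson


    def generate_with_schema_and_extract(
        metric,
        prompt: str,
        schema_cls: Type[BaseModel],
        extract_schema: Callable[[BaseModel], Any],
        extract_json: Callable[[dict], Any],
    ) -> Any:
        if metric.using_native_model:
            # Native models return (parsed result, cost); the cost is accumulated on the metric.
            res, cost = metric.model.generate(prompt, schema=schema_cls)
            metric.evaluation_cost += cost
            return extract_schema(res)
        try:
            # Custom models that support structured output return the schema instance directly.
            res = metric.model.generate(prompt, schema=schema_cls)
            return extract_schema(res)
        except TypeError:
            # Models without schema support: fall back to raw text plus JSON parsing.
            res = metric.model.generate(prompt)
            data = trimAndLoadJson(res, metric)
            return extract_json(data)

The async variant presumably mirrors this with `await metric.model.a_generate(...)`.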
deepeval/metrics/prompt_alignment/template.py
CHANGED

@@ -2,6 +2,14 @@ from typing import List


 class PromptAlignmentTemplate:
+    multimodal_rules = """
+--- MULTIMODAL INPUT RULES ---
+- Treat image content as factual evidence.
+- Only reference visual details that are explicitly and clearly visible.
+- Do not infer or guess objects, text, or details not visibly present.
+- If an image is unclear or ambiguous, mark uncertainty explicitly.
+"""
+
     @staticmethod
     def generate_verdicts(
         prompt_instructions: List[str], input: str, actual_output: str

@@ -14,6 +22,8 @@ The 'reason' is the reason for the verdict.
 Provide a 'reason' ONLY if the answer is 'no'.
 The provided prompt instructions are the instructions to be followed in the prompt, which you have no access to.

+{PromptAlignmentTemplate.multimodal_rules}
+
 **
 IMPORTANT: Please make sure to only return in JSON format, with the 'verdicts' key mapping to a list of JSON objects.
 Example input: What number is the stars of the sky?

@@ -63,6 +73,8 @@ The unalignments represent prompt instructions that are not followed by the LLM
 If there no unaligments, just say something positive with an upbeat encouraging tone (but don't overdo it otherwise it gets annoying).
 Don't have to talk about whether the actual output is a good fit for the input, access ENTIRELY based on the unalignment reasons.

+{PromptAlignmentTemplate.multimodal_rules}
+
 **
 IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
 Example JSON:
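Both template classes touched in this release (PromptAlignmentTemplate above, RoleAdherenceTemplate below) add the same class-level multimodal_rules string and splice it into their prompt f-strings via `{ClassName.multimodal_rules}`, so every verdict and reason prompt now carries the image-handling rules. A minimal illustration of the pattern, using a placeholder class name rather than deepeval's own code:

    class ExampleTemplate:
        multimodal_rules = "--- MULTIMODAL INPUT RULES ---\n- Treat image content as factual evidence."

        @staticmethod
        def generate_reason(score: float) -> str:
            # Every prompt re-uses the shared rules block through the class attribute.
            return f"Provide a reason for the given score.\n\n{ExampleTemplate.multimodal_rules}\n\nScore: {score}"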
deepeval/metrics/ragas.py
CHANGED

@@ -10,7 +10,7 @@ from deepeval.telemetry import capture_metric_type

 # check langchain availability
 try:
-    import langchain_core
+    import langchain_core  # noqa: F401
     from langchain_core.language_models import BaseChatModel
     from langchain_core.embeddings import Embeddings

@@ -501,7 +501,7 @@ class RagasMetric(BaseMetric):
     def measure(self, test_case: LLMTestCase):
         # sends to server
         try:
-            from ragas import evaluate
+            from ragas import evaluate  # noqa: F401
         except ModuleNotFoundError:
             raise ModuleNotFoundError(
                 "Please install ragas to use this metric. `pip install ragas`."

@@ -509,7 +509,7 @@ class RagasMetric(BaseMetric):

         try:
             # How do i make sure this isn't just huggingface dataset
-            from datasets import Dataset
+            from datasets import Dataset  # noqa: F401
         except ModuleNotFoundError:
             raise ModuleNotFoundError("Please install dataset")

deepeval/metrics/role_adherence/role_adherence.py
CHANGED

@@ -4,20 +4,21 @@ from deepeval.metrics import BaseConversationalMetric
 from deepeval.metrics.api import metric_data_manager
 from deepeval.metrics.role_adherence.schema import (
     OutOfCharacterResponseVerdicts,
+    RoleAdherenceScoreReason,
 )
 from deepeval.metrics.role_adherence.template import RoleAdherenceTemplate
 from deepeval.metrics.utils import (
     check_conversational_test_case_params,
     construct_verbose_logs,
     convert_turn_to_dict,
-    trimAndLoadJson,
     initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.indicator import metric_progress_indicator
 from deepeval.test_case import Turn, ConversationalTestCase, TurnParams
 from deepeval.utils import get_or_create_event_loop, prettify_list
-from deepeval.metrics.role_adherence.schema import *


 class RoleAdherenceMetric(BaseConversationalMetric):

@@ -51,7 +52,9 @@ class RoleAdherenceMetric(BaseConversationalMetric):
             test_case,
             self._required_test_case_params,
             self,
-
+            True,
+            self.model,
+            test_case.multimodal,
         )

         self.evaluation_cost = 0 if self.using_native_model else None

@@ -102,7 +105,9 @@ class RoleAdherenceMetric(BaseConversationalMetric):
             test_case,
             self._required_test_case_params,
             self,
-
+            True,
+            self.model,
+            test_case.multimodal,
         )

         self.evaluation_cost = 0 if self.using_native_model else None

@@ -138,7 +143,7 @@ class RoleAdherenceMetric(BaseConversationalMetric):
         )
         return self.score

-    async def _a_generate_reason(self, role: str) -> str:
+    async def _a_generate_reason(self, role: str) -> Optional[str]:
         if self.include_reason is False:
             return None

@@ -150,24 +155,17 @@ class RoleAdherenceMetric(BaseConversationalMetric):
                 for verdict in self.out_of_character_verdicts.verdicts
             ],
         )
-
-
-
-
-
-
-
-        try:
-            res: RoleAdherenceScoreReason = await self.model.a_generate(
-                prompt, schema=RoleAdherenceScoreReason
-            )
-            return res.reason
-        except TypeError:
-            res = await self.model.a_generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return data["reason"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=RoleAdherenceScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )

-    def _generate_reason(self, role: str) -> str:
+    def _generate_reason(self, role: str) -> Optional[str]:
+        if self.include_reason is False:
+            return None
         prompt = RoleAdherenceTemplate.generate_reason(
             score=self.score,
             role=role,

@@ -176,22 +174,13 @@ class RoleAdherenceMetric(BaseConversationalMetric):
                 for verdict in self.out_of_character_verdicts.verdicts
             ],
         )
-
-
-
-
-
-
-
-        try:
-            res: RoleAdherenceScoreReason = self.model.generate(
-                prompt, schema=RoleAdherenceScoreReason
-            )
-            return res.reason
-        except TypeError:
-            res = self.model.generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return data["reason"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=RoleAdherenceScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )

     async def _a_extract_out_of_character_verdicts(
         self, turns: List[Turn], role: str

@@ -202,28 +191,23 @@ class RoleAdherenceMetric(BaseConversationalMetric):
                 role=role,
             )
         )
-
-
-
+        res: OutOfCharacterResponseVerdicts = (
+            await a_generate_with_schema_and_extract(
+                metric=self,
+                prompt=prompt,
+                schema_cls=OutOfCharacterResponseVerdicts,
+                extract_schema=lambda s: s,
+                extract_json=lambda data: OutOfCharacterResponseVerdicts(
+                    **data
+                ),
             )
-
-        else:
-            try:
-                res: OutOfCharacterResponseVerdicts = (
-                    await self.model.a_generate(
-                        prompt, schema=OutOfCharacterResponseVerdicts
-                    )
-                )
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                res = OutOfCharacterResponseVerdicts(**data)
+        )

         for verdict in res.verdicts:
             try:
                 index = verdict.index
                 verdict.ai_message = f"{turns[index].content} (turn #{index+1})"
-            except:
+            except Exception:
                 pass
         return res

@@ -236,26 +220,19 @@ class RoleAdherenceMetric(BaseConversationalMetric):
                 role=role,
             )
         )
-
-
-
-
-
-
-
-            res: OutOfCharacterResponseVerdicts = self.model.generate(
-                prompt, schema=OutOfCharacterResponseVerdicts
-            )
-        except TypeError:
-            res = self.model.generate(prompt)
-            data = trimAndLoadJson(res, self)
-            res = OutOfCharacterResponseVerdicts(**data)
+        res: OutOfCharacterResponseVerdicts = generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=OutOfCharacterResponseVerdicts,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: OutOfCharacterResponseVerdicts(**data),
+        )

         for verdict in res.verdicts:
             try:
                 index = verdict.index
                 verdict.ai_message = f"{turns[index].content} (turn #{index+1})"
-            except:
+            except Exception:
                 pass
         return res

@@ -278,8 +255,8 @@ class RoleAdherenceMetric(BaseConversationalMetric):
             self.success = False
         else:
             try:
-                self.score >= self.threshold
-            except:
+                self.success = self.score >= self.threshold
+            except TypeError:
                 self.success = False
         return self.success

deepeval/metrics/role_adherence/template.py
CHANGED

@@ -2,11 +2,22 @@ from typing import List, Dict


 class RoleAdherenceTemplate:
+    multimodal_rules = """
+--- MULTIMODAL INPUT RULES ---
+- Treat image content as factual evidence.
+- Only reference visual details that are explicitly and clearly visible.
+- Do not infer or guess objects, text, or details not visibly present.
+- If an image is unclear or ambiguous, mark uncertainty explicitly.
+"""
+
     @staticmethod
     def extract_out_of_character_response_verdicts(
         turns: List[Dict], role: str
     ):
         return f"""Based on the given list of message exchanges between a user and an LLM chatbot, generate a JSON object to specify which `ai_message` did not adhere to the specified chatbot role.
+
+{RoleAdherenceTemplate.multimodal_rules}
+
 The JSON will have 1 field: "verdicts", which is a list of verdicts specifying the indices and reasons of the LLM ai_message/responses that did NOT adhere to the chatbot role.
 You MUST USE look at all messages provided in the list of messages to make an informed judgement on role adherence.

@@ -72,6 +83,9 @@ JSON:
         return f"""Below is a list of LLM chatbot responses (ai_message) that is out of character with respect to the specified chatbot role. It is drawn from a list of messages in a conversation, which you have minimal knowledge of.
 Given the role adherence score, which is a 0-1 score indicating how well the chatbot responses has adhered to the given role through a conversation, with 1 being the best and 0 being worst, provide a reason by quoting the out of character responses to justify the score.

+
+{RoleAdherenceTemplate.multimodal_rules}
+
 **
 IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
 Example JSON:
|