deepeval 3.7.4__py3-none-any.whl → 3.7.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/config/settings.py +35 -1
- deepeval/dataset/api.py +23 -1
- deepeval/dataset/golden.py +139 -2
- deepeval/evaluate/evaluate.py +16 -11
- deepeval/evaluate/execute.py +13 -181
- deepeval/evaluate/utils.py +6 -26
- deepeval/integrations/pydantic_ai/agent.py +19 -2
- deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
- deepeval/key_handler.py +3 -0
- deepeval/metrics/__init__.py +14 -16
- deepeval/metrics/answer_relevancy/answer_relevancy.py +118 -116
- deepeval/metrics/answer_relevancy/template.py +22 -3
- deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
- deepeval/metrics/arena_g_eval/template.py +17 -1
- deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
- deepeval/metrics/argument_correctness/template.py +19 -2
- deepeval/metrics/base_metric.py +13 -44
- deepeval/metrics/bias/bias.py +102 -108
- deepeval/metrics/bias/template.py +14 -2
- deepeval/metrics/contextual_precision/contextual_precision.py +96 -94
- deepeval/metrics/contextual_precision/template.py +115 -66
- deepeval/metrics/contextual_recall/contextual_recall.py +94 -84
- deepeval/metrics/contextual_recall/template.py +106 -55
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +86 -84
- deepeval/metrics/contextual_relevancy/template.py +87 -58
- deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
- deepeval/metrics/conversation_completeness/template.py +23 -3
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
- deepeval/metrics/conversational_dag/nodes.py +66 -123
- deepeval/metrics/conversational_dag/templates.py +16 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
- deepeval/metrics/dag/dag.py +10 -0
- deepeval/metrics/dag/nodes.py +63 -126
- deepeval/metrics/dag/templates.py +16 -2
- deepeval/metrics/exact_match/exact_match.py +9 -1
- deepeval/metrics/faithfulness/faithfulness.py +138 -149
- deepeval/metrics/faithfulness/schema.py +1 -1
- deepeval/metrics/faithfulness/template.py +200 -115
- deepeval/metrics/g_eval/g_eval.py +87 -78
- deepeval/metrics/g_eval/template.py +18 -1
- deepeval/metrics/g_eval/utils.py +7 -6
- deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
- deepeval/metrics/goal_accuracy/template.py +21 -3
- deepeval/metrics/hallucination/hallucination.py +60 -75
- deepeval/metrics/hallucination/template.py +13 -0
- deepeval/metrics/indicator.py +7 -10
- deepeval/metrics/json_correctness/json_correctness.py +40 -38
- deepeval/metrics/json_correctness/template.py +10 -0
- deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
- deepeval/metrics/knowledge_retention/schema.py +9 -3
- deepeval/metrics/knowledge_retention/template.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +68 -38
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
- deepeval/metrics/mcp/template.py +52 -0
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
- deepeval/metrics/mcp_use_metric/template.py +12 -0
- deepeval/metrics/misuse/misuse.py +77 -97
- deepeval/metrics/misuse/template.py +15 -0
- deepeval/metrics/multimodal_metrics/__init__.py +0 -19
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +59 -53
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +79 -95
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +59 -53
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +59 -53
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +111 -109
- deepeval/metrics/non_advice/non_advice.py +79 -105
- deepeval/metrics/non_advice/template.py +12 -0
- deepeval/metrics/pattern_match/pattern_match.py +12 -4
- deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
- deepeval/metrics/pii_leakage/template.py +14 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
- deepeval/metrics/plan_adherence/template.py +11 -0
- deepeval/metrics/plan_quality/plan_quality.py +63 -87
- deepeval/metrics/plan_quality/template.py +9 -0
- deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
- deepeval/metrics/prompt_alignment/template.py +12 -0
- deepeval/metrics/ragas.py +3 -3
- deepeval/metrics/role_adherence/role_adherence.py +48 -71
- deepeval/metrics/role_adherence/template.py +14 -0
- deepeval/metrics/role_violation/role_violation.py +75 -108
- deepeval/metrics/role_violation/template.py +12 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
- deepeval/metrics/step_efficiency/template.py +11 -0
- deepeval/metrics/summarization/summarization.py +115 -183
- deepeval/metrics/summarization/template.py +19 -0
- deepeval/metrics/task_completion/task_completion.py +67 -73
- deepeval/metrics/tool_correctness/tool_correctness.py +45 -44
- deepeval/metrics/tool_use/tool_use.py +42 -66
- deepeval/metrics/topic_adherence/template.py +13 -0
- deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
- deepeval/metrics/toxicity/template.py +13 -0
- deepeval/metrics/toxicity/toxicity.py +80 -99
- deepeval/metrics/turn_contextual_precision/schema.py +21 -0
- deepeval/metrics/turn_contextual_precision/template.py +187 -0
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +592 -0
- deepeval/metrics/turn_contextual_recall/schema.py +21 -0
- deepeval/metrics/turn_contextual_recall/template.py +178 -0
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +563 -0
- deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
- deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +576 -0
- deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
- deepeval/metrics/turn_faithfulness/template.py +218 -0
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +627 -0
- deepeval/metrics/turn_relevancy/template.py +14 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
- deepeval/metrics/utils.py +158 -122
- deepeval/models/__init__.py +0 -12
- deepeval/models/base_model.py +49 -33
- deepeval/models/embedding_models/__init__.py +7 -0
- deepeval/models/embedding_models/azure_embedding_model.py +79 -33
- deepeval/models/embedding_models/local_embedding_model.py +39 -20
- deepeval/models/embedding_models/ollama_embedding_model.py +52 -19
- deepeval/models/embedding_models/openai_embedding_model.py +42 -22
- deepeval/models/llms/amazon_bedrock_model.py +226 -72
- deepeval/models/llms/anthropic_model.py +178 -63
- deepeval/models/llms/azure_model.py +218 -60
- deepeval/models/llms/constants.py +2032 -0
- deepeval/models/llms/deepseek_model.py +95 -40
- deepeval/models/llms/gemini_model.py +209 -64
- deepeval/models/llms/grok_model.py +139 -68
- deepeval/models/llms/kimi_model.py +140 -90
- deepeval/models/llms/litellm_model.py +131 -37
- deepeval/models/llms/local_model.py +125 -21
- deepeval/models/llms/ollama_model.py +147 -24
- deepeval/models/llms/openai_model.py +222 -269
- deepeval/models/llms/portkey_model.py +81 -22
- deepeval/models/llms/utils.py +8 -3
- deepeval/models/retry_policy.py +17 -14
- deepeval/models/utils.py +106 -5
- deepeval/optimizer/__init__.py +5 -0
- deepeval/optimizer/algorithms/__init__.py +6 -0
- deepeval/optimizer/algorithms/base.py +29 -0
- deepeval/optimizer/algorithms/configs.py +18 -0
- deepeval/optimizer/algorithms/copro/__init__.py +5 -0
- deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
- deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
- deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
- deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
- deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
- deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
- deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
- deepeval/optimizer/algorithms/simba/__init__.py +5 -0
- deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
- deepeval/{optimization → optimizer}/configs.py +5 -8
- deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
- deepeval/optimizer/prompt_optimizer.py +263 -0
- deepeval/optimizer/rewriter/__init__.py +5 -0
- deepeval/optimizer/rewriter/rewriter.py +124 -0
- deepeval/optimizer/rewriter/utils.py +214 -0
- deepeval/optimizer/scorer/__init__.py +5 -0
- deepeval/optimizer/scorer/base.py +86 -0
- deepeval/optimizer/scorer/scorer.py +316 -0
- deepeval/optimizer/scorer/utils.py +30 -0
- deepeval/optimizer/types.py +148 -0
- deepeval/{optimization → optimizer}/utils.py +47 -165
- deepeval/prompt/prompt.py +5 -9
- deepeval/simulator/conversation_simulator.py +43 -0
- deepeval/simulator/template.py +13 -0
- deepeval/test_case/__init__.py +1 -3
- deepeval/test_case/api.py +26 -45
- deepeval/test_case/arena_test_case.py +7 -2
- deepeval/test_case/conversational_test_case.py +68 -1
- deepeval/test_case/llm_test_case.py +206 -1
- deepeval/test_case/utils.py +4 -8
- deepeval/test_run/api.py +18 -14
- deepeval/test_run/test_run.py +3 -3
- deepeval/tracing/patchers.py +9 -4
- deepeval/tracing/tracing.py +2 -2
- deepeval/utils.py +65 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -4
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/RECORD +180 -193
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -148
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
- deepeval/models/mlllms/__init__.py +0 -4
- deepeval/models/mlllms/azure_model.py +0 -343
- deepeval/models/mlllms/gemini_model.py +0 -313
- deepeval/models/mlllms/ollama_model.py +0 -175
- deepeval/models/mlllms/openai_model.py +0 -309
- deepeval/optimization/__init__.py +0 -13
- deepeval/optimization/adapters/__init__.py +0 -2
- deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
- deepeval/optimization/aggregates.py +0 -14
- deepeval/optimization/copro/configs.py +0 -31
- deepeval/optimization/gepa/__init__.py +0 -7
- deepeval/optimization/gepa/configs.py +0 -115
- deepeval/optimization/miprov2/configs.py +0 -134
- deepeval/optimization/miprov2/loop.py +0 -785
- deepeval/optimization/mutations/__init__.py +0 -0
- deepeval/optimization/mutations/prompt_rewriter.py +0 -458
- deepeval/optimization/policies/__init__.py +0 -16
- deepeval/optimization/policies/tie_breaker.py +0 -67
- deepeval/optimization/prompt_optimizer.py +0 -462
- deepeval/optimization/simba/__init__.py +0 -0
- deepeval/optimization/simba/configs.py +0 -33
- deepeval/optimization/types.py +0 -361
- deepeval/test_case/mllm_test_case.py +0 -170
- /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
- /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0

deepeval/metrics/g_eval/template.py
CHANGED
@@ -3,11 +3,23 @@ import textwrap
 
 
 class GEvalTemplate:
+    multimodal_rules = """
+    --- MULTIMODAL INPUT RULES ---
+    - Treat image content as factual evidence.
+    - Only reference visual details that are explicitly and clearly visible.
+    - Do not infer or guess objects, text, or details not visibly present.
+    - If an image is unclear or ambiguous, mark uncertainty explicitly.
+    """
+
     @staticmethod
-    def generate_evaluation_steps(
+    def generate_evaluation_steps(
+        parameters: str, criteria: str, multimodal: bool = False
+    ):
         return textwrap.dedent(
             f"""Given an evaluation criteria which outlines how you should judge the {parameters}, generate 3-4 concise evaluation steps based on the criteria below. You MUST make it clear how to evaluate {parameters} in relation to one another.
 
+            {GEvalTemplate.multimodal_rules if multimodal else ""}
+
             Evaluation Criteria:
             {criteria}
 
@@ -31,6 +43,7 @@ class GEvalTemplate:
         rubric: Optional[str] = None,
         score_range: Tuple[int, int] = (0, 10),
         _additional_context: Optional[str] = None,
+        multimodal: bool = False,
     ):
         rubric_text = f"Rubric:\n{rubric}\n" if rubric else ""
         dependencies = (
@@ -62,6 +75,7 @@ class GEvalTemplate:
             - {reasoning_expectation}
             - Mention key details from the test case parameters.
             - Be concise, clear, and focused on the evaluation logic.
+            {GEvalTemplate.multimodal_rules if multimodal else ""}
 
             Only return valid JSON. Do **not** include any extra commentary or text.
 
@@ -95,6 +109,7 @@ class GEvalTemplate:
         test_case_content: str,
         parameters: str,
         _additional_context: Optional[str] = None,
+        multimodal: bool = False,
     ):
         additional_context = (
             f"\n\nAdditional Context:\n{_additional_context}\n"
@@ -104,6 +119,8 @@ class GEvalTemplate:
         return textwrap.dedent(
             f"""Given the evaluation steps, return a JSON with two keys: 1) a `score` key that is STRICTLY EITHER 1 (follows the criteria 100% outlined in the evaluation steps), OR 0 (does not follow the criteria), and 2) a `reason` key, a reason for the given score, but DO NOT QUOTE THE SCORE in your reason. Please mention specific information from {parameters} in your reason, but be very concise with it!
 
+            {GEvalTemplate.multimodal_rules if multimodal else ""}
+
             Evaluation Steps:
             {evaluation_steps}
 
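These template hunks thread a new `multimodal` flag through `GEvalTemplate`: the shared `multimodal_rules` block is declared once on the class and interpolated into each prompt only when the flag is set. A minimal usage sketch of the new signature follows; the parameter description and criteria string are invented example values, not content from the package.

from deepeval.metrics.g_eval.template import GEvalTemplate

# Build the evaluation-steps prompt with the 3.7.6 signature shown above.
prompt = GEvalTemplate.generate_evaluation_steps(
    parameters="input and actual output",  # hypothetical description of the judged fields
    criteria="Judge whether the actual output answers the input accurately.",
    multimodal=True,  # injects GEvalTemplate.multimodal_rules into the prompt
)
print(prompt)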
deepeval/metrics/g_eval/utils.py
CHANGED
@@ -9,8 +9,8 @@ from deepeval.test_case import (
     LLMTestCase,
     ToolCall,
 )
-from deepeval.models.llms.openai_model import unsupported_log_probs_gpt_models
 from pydantic import BaseModel, field_validator
+from deepeval.models.llms.constants import OPENAI_MODELS_DATA
 
 from deepeval.test_case.conversational_test_case import ConversationalTestCase
 
@@ -114,16 +114,17 @@ def format_rubrics(rubrics: Optional[List[Rubric]]) -> Optional[str]:
 
 def no_log_prob_support(model: Union[str, DeepEvalBaseLLM]):
 
-    if isinstance(model, str)
-
+    if isinstance(model, str):
+        model_data = OPENAI_MODELS_DATA.get(model)
+        if not model_data.supports_log_probs:
+            return True
     elif (
-        isinstance(model, GPTModel)
-        and model.model_name in unsupported_log_probs_gpt_models
+        isinstance(model, GPTModel) and not model.model_data.supports_log_probs
     ):
         return True
     elif (
         isinstance(model, AzureOpenAIModel)
-        and model.
+        and not model.model_data.supports_log_probs
     ):
         return True
 
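`no_log_prob_support` now consults the `OPENAI_MODELS_DATA` table from the new `deepeval/models/llms/constants.py` instead of the removed `unsupported_log_probs_gpt_models` list. The table's structure is not part of this hunk, so the sketch below only illustrates the lookup shape the new branch implies; the entry values and the `ModelData` wrapper are assumptions.

from dataclasses import dataclass

@dataclass
class ModelData:
    supports_log_probs: bool  # field name taken from the hunk above; the wrapper itself is assumed

# Hypothetical stand-in for OPENAI_MODELS_DATA, keyed by model name.
OPENAI_MODELS_DATA = {
    "gpt-4o": ModelData(supports_log_probs=True),
    "o1": ModelData(supports_log_probs=False),
}

model_data = OPENAI_MODELS_DATA.get("o1")
if not model_data.supports_log_probs:
    # G-Eval would fall back to its non-logprob scoring path for this model.
    pass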
deepeval/metrics/goal_accuracy/goal_accuracy.py
CHANGED
@@ -3,11 +3,12 @@ import asyncio
 from deepeval.utils import get_or_create_event_loop, prettify_list
 from deepeval.metrics.utils import (
     construct_verbose_logs,
-    trimAndLoadJson,
     get_unit_interactions,
     print_tools_called,
     check_conversational_test_case_params,
     initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.test_case import ConversationalTestCase, TurnParams, Turn
 from deepeval.metrics import BaseConversationalMetric
@@ -55,8 +56,14 @@ class GoalAccuracyMetric(BaseConversationalMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ):
+        multimodal = test_case.multimodal
         check_conversational_test_case_params(
-            test_case,
+            test_case,
+            self._required_test_case_params,
+            self,
+            None,
+            self.model,
+            multimodal,
         )
 
         self.evaluation_cost = 0 if self.using_native_model else None
@@ -80,17 +87,21 @@ class GoalAccuracyMetric(BaseConversationalMetric):
         )
         goal_scores = [
             self._get_goal_accuracy_score(
-                task.user_goal, task.steps_taken
+                task.user_goal, task.steps_taken, multimodal
             )
             for task in goal_and_steps_taken
         ]
         plan_scores = [
-            self._get_plan_scores(
+            self._get_plan_scores(
+                task.user_goal, task.steps_taken, multimodal
+            )
             for task in goal_and_steps_taken
         ]
         self.score = self._calculate_score(goal_scores, plan_scores)
         self.success = self.score >= self.threshold
-        self.reason = self._generate_reason(
+        self.reason = self._generate_reason(
+            goal_scores, plan_scores, multimodal
+        )
 
         self.verbose_logs = construct_verbose_logs(
             self,
@@ -117,8 +128,14 @@ class GoalAccuracyMetric(BaseConversationalMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ):
+        multimodal = test_case.multimodal
         check_conversational_test_case_params(
-            test_case,
+            test_case,
+            self._required_test_case_params,
+            self,
+            None,
+            self.model,
+            multimodal,
         )
 
         self.evaluation_cost = 0 if self.using_native_model else None
@@ -134,21 +151,23 @@ class GoalAccuracyMetric(BaseConversationalMetric):
         goal_scores = await asyncio.gather(
             *[
                 self._a_get_goal_accuracy_score(
-                    task.user_goal, task.steps_taken
+                    task.user_goal, task.steps_taken, multimodal
                 )
                 for task in goal_and_steps_taken
             ]
         )
         plan_scores = await asyncio.gather(
             *[
-                self._a_get_plan_scores(
+                self._a_get_plan_scores(
+                    task.user_goal, task.steps_taken, multimodal
+                )
                 for task in goal_and_steps_taken
             ]
         )
         self.score = self._calculate_score(goal_scores, plan_scores)
         self.success = self.score >= self.threshold
         self.reason = await self._a_generate_reason(
-            goal_scores, plan_scores
+            goal_scores, plan_scores, multimodal
         )
 
         self.verbose_logs = construct_verbose_logs(
@@ -191,41 +210,31 @@ class GoalAccuracyMetric(BaseConversationalMetric):
             goal_and_steps_taken.append(new_goal_steps)
         return goal_and_steps_taken
 
-    def _get_plan_scores(self, user_goal, steps_taken):
+    def _get_plan_scores(self, user_goal, steps_taken, multimodal: bool):
         prompt = GoalAccuracyTemplate.get_plan_evaluation_score(
-            user_goal, "\n".join(steps_taken)
+            user_goal, "\n".join(steps_taken), multimodal
+        )
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=PlanScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: PlanScore(**data),
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=PlanScore)
-            self.evaluation_cost += cost
-            return res
-        else:
-            try:
-                res: PlanScore = self.model.generate(prompt, schema=PlanScore)
-                return res
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return PlanScore(**data)
 
-    async def _a_get_plan_scores(
+    async def _a_get_plan_scores(
+        self, user_goal, steps_taken, multimodal: bool
+    ):
         prompt = GoalAccuracyTemplate.get_plan_evaluation_score(
-            user_goal, "\n".join(steps_taken)
+            user_goal, "\n".join(steps_taken), multimodal
+        )
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=PlanScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: PlanScore(**data),
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=PlanScore)
-            self.evaluation_cost += cost
-            return res
-        else:
-            try:
-                res: PlanScore = await self.model.a_generate(
-                    prompt, schema=PlanScore
-                )
-                return res
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return PlanScore(**data)
 
     def _calculate_score(
         self, goal_scores: List[GoalScore], plan_scores: List[PlanScore]
@@ -240,7 +249,10 @@ class GoalAccuracyMetric(BaseConversationalMetric):
         return 0 if self.strict_mode and score < self.threshold else score
 
     def _generate_reason(
-        self,
+        self,
+        goal_scores: List[GoalScore],
+        plan_scores: List[PlanScore],
+        multimodal: bool,
     ):
         goal_evaluations = ""
         for goal_score in goal_scores:
@@ -254,18 +266,25 @@ class GoalAccuracyMetric(BaseConversationalMetric):
         )
 
         prompt = GoalAccuracyTemplate.get_final_reason(
-            self.score,
+            self.score,
+            self.threshold,
+            goal_evaluations,
+            plan_evalautions,
+            multimodal,
         )
         if self.using_native_model:
             res, cost = self.model.generate(prompt)
-            self.
+            self._accrue_cost(cost)
             return res
         else:
             res = self.model.generate(prompt)
             return res
 
     async def _a_generate_reason(
-        self,
+        self,
+        goal_scores: List[GoalScore],
+        plan_scores: List[PlanScore],
+        multimodal: bool,
     ):
         goal_evaluations = ""
         for goal_score in goal_scores:
@@ -279,51 +298,47 @@ class GoalAccuracyMetric(BaseConversationalMetric):
         )
 
         prompt = GoalAccuracyTemplate.get_final_reason(
-            self.score,
+            self.score,
+            self.threshold,
+            goal_evaluations,
+            plan_evalautions,
+            multimodal,
         )
         if self.using_native_model:
             res, cost = await self.model.a_generate(prompt)
-            self.
+            self._accrue_cost(cost)
             return res
         else:
             res = await self.model.a_generate(prompt)
             return res
 
-    def _get_goal_accuracy_score(
+    def _get_goal_accuracy_score(
+        self, user_goal, steps_taken, multimodal: bool
+    ):
         prompt = GoalAccuracyTemplate.get_accuracy_score(
-            user_goal, "\n".join(steps_taken)
+            user_goal, "\n".join(steps_taken), multimodal
+        )
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=GoalScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: GoalScore(**data),
        )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=GoalScore)
-            self.evaluation_cost += cost
-            return res
-        else:
-            try:
-                res: GoalScore = self.model.generate(prompt, schema=GoalScore)
-                return res
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return GoalScore(**data)
 
-    async def _a_get_goal_accuracy_score(
+    async def _a_get_goal_accuracy_score(
+        self, user_goal, steps_taken, multimodal: bool
+    ):
         prompt = GoalAccuracyTemplate.get_accuracy_score(
-            user_goal, "\n".join(steps_taken)
+            user_goal, "\n".join(steps_taken), multimodal
+        )
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=GoalScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: GoalScore(**data),
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=GoalScore)
-            self.evaluation_cost += cost
-            return res
-        else:
-            try:
-                res: GoalScore = await self.model.a_generate(
-                    prompt, schema=GoalScore
-                )
-                return res
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return GoalScore(**data)
 
     def print_goals_and_steps_taken(self, goals_and_steps):
         final_goals_and_steps = ""
@@ -340,7 +355,7 @@ class GoalAccuracyMetric(BaseConversationalMetric):
         else:
             try:
                 self.success = self.score >= self.threshold
-            except:
+            except TypeError:
                 self.success = False
         return self.success
 
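Throughout these metric files, the repeated `if self.using_native_model: ... else: try/except TypeError ... trimAndLoadJson(...)` blocks are collapsed into shared `generate_with_schema_and_extract` / `a_generate_with_schema_and_extract` helpers imported from `deepeval.metrics.utils`. The helper bodies are not shown in this diff; the sketch below only approximates their behavior, inferred from the inline code they replace.

from deepeval.metrics.utils import trimAndLoadJson  # assumed to still live in metrics/utils.py

def generate_with_schema_and_extract(
    metric, prompt, schema_cls, extract_schema, extract_json
):
    # Native models return (result, cost) and support structured output directly.
    if metric.using_native_model:
        res, cost = metric.model.generate(prompt, schema=schema_cls)
        metric.evaluation_cost += cost  # 3.7.6 appears to route this through metric._accrue_cost(cost)
        return extract_schema(res)
    # Custom models may or may not accept a schema; fall back to raw JSON parsing.
    try:
        res = metric.model.generate(prompt, schema=schema_cls)
        return extract_schema(res)
    except TypeError:
        res = metric.model.generate(prompt)
        data = trimAndLoadJson(res, metric)
        return extract_json(data)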
deepeval/metrics/goal_accuracy/template.py
CHANGED
@@ -3,8 +3,16 @@ import textwrap
 
 
 class GoalAccuracyTemplate:
+    multimodal_rules = """
+    --- MULTIMODAL INPUT RULES ---
+    - Treat image content as factual evidence.
+    - Only reference visual details that are explicitly and clearly visible.
+    - Do not infer or guess objects, text, or details not visibly present.
+    - If an image is unclear or ambiguous, mark uncertainty explicitly.
+    """
+
     @staticmethod
-    def get_accuracy_score(task, steps_taken):
+    def get_accuracy_score(task, steps_taken, multimodal: bool = False):
         return textwrap.dedent(
             f"""You are an expert evaluator assessing the **goal accuracy** of an AI assistant's single interaction.
 
@@ -36,6 +44,8 @@ class GoalAccuracyTemplate:
             - When uncertain, assume the goal was **not achieved**.
             - The metric is designed to fail unless the assistant's output is precise, complete, and user-visible.
 
+            {GoalAccuracyTemplate.multimodal_rules if multimodal else ""}
+
             SCORING GUIDE:
 
             - **1.0** → Goal completely and correctly achieved; all required outputs visible to the user.
@@ -102,7 +112,7 @@ class GoalAccuracyTemplate:
         )
 
     @staticmethod
-    def get_plan_evaluation_score(task, steps_taken):
+    def get_plan_evaluation_score(task, steps_taken, multimodal: bool = False):
         return textwrap.dedent(
             f"""You are an expert evaluator assessing the **planning quality** and **plan adherence** of an AI agent tasked with fulfilling a user's request.
 
@@ -132,6 +142,8 @@ class GoalAccuracyTemplate:
             - Tool use should be coherent within the plan, not ad hoc or speculative.
             - This evaluation excludes correctness or efficiency — focus solely on plan and adherence.
 
+            {GoalAccuracyTemplate.multimodal_rules if multimodal else ""}
+
             SCORING GUIDE:
 
             - **1.0** → Complete, clear, and logical plan **fully followed** with all steps aligned to the user's goal.
@@ -188,7 +200,11 @@ class GoalAccuracyTemplate:
 
     @staticmethod
     def get_final_reason(
-        final_score,
+        final_score,
+        threshold,
+        goal_evaluations,
+        plan_evalautions,
+        multimodal: bool = False,
     ):
         return textwrap.dedent(
             f"""You are an expert evaluator providing a **final justification** for whether an AI agent has passed or failed an evaluation metric.
 
@@ -213,6 +229,8 @@ class GoalAccuracyTemplate:
             - If the agent **failed**, explain which aspects (task or plan or both) led to the failure.
             - Avoid vague praise or criticism — ground the reason in the actual scores and justifications.
 
+            {GoalAccuracyTemplate.multimodal_rules if multimodal else ""}
+
             ---
 
             FORMAT:
deepeval/metrics/hallucination/hallucination.py
CHANGED
@@ -8,14 +8,19 @@ from deepeval.metrics import BaseMetric
 from deepeval.utils import get_or_create_event_loop, prettify_list
 from deepeval.metrics.utils import (
     construct_verbose_logs,
-    trimAndLoadJson,
     check_llm_test_case_params,
     initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.metrics.hallucination.template import HallucinationTemplate
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.indicator import metric_progress_indicator
-from deepeval.metrics.hallucination.schema import
+from deepeval.metrics.hallucination.schema import (
+    HallucinationVerdict,
+    Verdicts,
+    HallucinationScoreReason,
+)
 from deepeval.metrics.api import metric_data_manager
 
 
@@ -55,7 +60,16 @@ class HallucinationMetric(BaseMetric):
         _log_metric_to_confident: bool = True,
     ) -> float:
 
-
+        multimodal = test_case.multimodal
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            multimodal,
+        )
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -102,7 +116,16 @@ class HallucinationMetric(BaseMetric):
         _log_metric_to_confident: bool = True,
     ) -> float:
 
-
+        multimodal = test_case.multimodal
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            multimodal,
+        )
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -150,22 +173,13 @@ class HallucinationMetric(BaseMetric):
             score=format(self.score, ".2f"),
         )
 
-
-
-
-
-
-
-
-            try:
-                res: HallucinationScoreReason = await self.model.a_generate(
-                    prompt, schema=HallucinationScoreReason
-                )
-                return res.reason
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=HallucinationScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     def _generate_reason(self):
         if self.include_reason is False:
@@ -185,74 +199,45 @@ class HallucinationMetric(BaseMetric):
             score=format(self.score, ".2f"),
         )
 
-
-
-
-
-
-
-
-            try:
-                res: HallucinationScoreReason = self.model.generate(
-                    prompt, schema=HallucinationScoreReason
-                )
-                return res.reason
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=HallucinationScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     async def _a_generate_verdicts(
         self, actual_output: str, contexts: List[str]
     ) -> List[HallucinationVerdict]:
-        verdicts: List[HallucinationVerdict] = []
         prompt = self.evaluation_template.generate_verdicts(
             actual_output=actual_output, contexts=contexts
         )
-
-
-
-
-
-
-
-
-
-                )
-                verdicts = [item for item in res.verdicts]
-                return verdicts
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                verdicts = [
-                    HallucinationVerdict(**item) for item in data["verdicts"]
-                ]
-                return verdicts
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Verdicts,
+            extract_schema=lambda s: list(s.verdicts),
+            extract_json=lambda data: [
+                HallucinationVerdict(**item) for item in data["verdicts"]
+            ],
+        )
 
     def _generate_verdicts(
         self, actual_output: str, contexts: List[str]
     ) -> List[HallucinationVerdict]:
-        verdicts: List[HallucinationVerdict] = []
         prompt = self.evaluation_template.generate_verdicts(
             actual_output=actual_output, contexts=contexts
        )
-
-
-
-
-
-
-
-
-
-                return verdicts
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                verdicts = [
-                    HallucinationVerdict(**item) for item in data["verdicts"]
-                ]
-                return verdicts
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Verdicts,
+            extract_schema=lambda s: list(s.verdicts),
+            extract_json=lambda data: [
+                HallucinationVerdict(**item) for item in data["verdicts"]
+            ],
+        )
 
     def _calculate_score(self) -> float:
         number_of_verdicts = len(self.verdicts)
@@ -273,7 +258,7 @@ class HallucinationMetric(BaseMetric):
         else:
             try:
                 self.success = self.score <= self.threshold
-            except:
+            except TypeError:
                 self.success = False
         return self.success
 
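Both `measure` and `a_measure` now read `test_case.multimodal` and forward it to `check_llm_test_case_params`, so one `HallucinationMetric` instance serves text-only and multimodal test cases alike. Below is a plain-text usage sketch using the standard public API; the example strings are invented, and how a test case ends up flagged as multimodal (handled in `llm_test_case.py`) is not shown in this hunk.

from deepeval.test_case import LLMTestCase
from deepeval.metrics import HallucinationMetric

# Invented example values for illustration only.
test_case = LLMTestCase(
    input="What does the chart show?",
    actual_output="Revenue grew 12% year over year.",
    context=["The attached chart reports 12% YoY revenue growth."],
)

metric = HallucinationMetric(threshold=0.5)
metric.measure(test_case)  # internally reads test_case.multimodal
print(metric.score, metric.reason)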
deepeval/metrics/hallucination/template.py
CHANGED
@@ -2,9 +2,20 @@ from typing import List
 
 
 class HallucinationTemplate:
+    multimodal_rules = """
+    --- MULTIMODAL INPUT RULES ---
+    - Treat image content as factual evidence.
+    - Only reference visual details that are explicitly and clearly visible.
+    - Do not infer or guess objects, text, or details not visibly present.
+    - If an image is unclear or ambiguous, mark uncertainty explicitly.
+    """
+
     @staticmethod
     def generate_verdicts(actual_output: str, contexts: List[str]):
         return f"""For each context in contexts, which is a list of strings, please generate a list of JSON objects to indicate whether the given 'actual output' agrees with EACH context. The JSON will have 2 fields: 'verdict' and 'reason'.
+
+{HallucinationTemplate.multimodal_rules}
+
 The 'verdict' key should STRICTLY be either 'yes' or 'no', and states whether the given text agrees with the context.
 The 'reason' is the reason for the verdict. When the answer is 'no', try to provide a correction in the reason.
 
@@ -46,6 +57,8 @@ JSON:
     ):
         return f"""Given a list of factual alignments and contradictions, which highlights alignment/contradictions between the `actual output` and `contexts, use it to provide a reason for the hallucination score in a CONCISELY. Note that The hallucination score ranges from 0 - 1, and the lower the better.
 
+{HallucinationTemplate.multimodal_rules}
+
 **
 IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
 Example JSON: