deepeval 3.7.3__py3-none-any.whl → 3.7.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/cli/test.py +1 -1
- deepeval/config/settings.py +102 -13
- deepeval/dataset/golden.py +54 -2
- deepeval/evaluate/configs.py +1 -1
- deepeval/evaluate/evaluate.py +16 -8
- deepeval/evaluate/execute.py +74 -27
- deepeval/evaluate/utils.py +26 -22
- deepeval/integrations/pydantic_ai/agent.py +19 -2
- deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
- deepeval/metrics/__init__.py +14 -12
- deepeval/metrics/answer_relevancy/answer_relevancy.py +74 -29
- deepeval/metrics/answer_relevancy/template.py +188 -92
- deepeval/metrics/argument_correctness/template.py +2 -2
- deepeval/metrics/base_metric.py +2 -5
- deepeval/metrics/bias/template.py +3 -3
- deepeval/metrics/contextual_precision/contextual_precision.py +53 -15
- deepeval/metrics/contextual_precision/template.py +115 -66
- deepeval/metrics/contextual_recall/contextual_recall.py +50 -13
- deepeval/metrics/contextual_recall/template.py +106 -55
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +47 -15
- deepeval/metrics/contextual_relevancy/template.py +87 -58
- deepeval/metrics/conversation_completeness/template.py +2 -2
- deepeval/metrics/conversational_dag/templates.py +4 -4
- deepeval/metrics/conversational_g_eval/template.py +4 -3
- deepeval/metrics/dag/templates.py +5 -5
- deepeval/metrics/faithfulness/faithfulness.py +70 -27
- deepeval/metrics/faithfulness/schema.py +1 -1
- deepeval/metrics/faithfulness/template.py +200 -115
- deepeval/metrics/g_eval/utils.py +2 -2
- deepeval/metrics/hallucination/template.py +4 -4
- deepeval/metrics/indicator.py +4 -4
- deepeval/metrics/misuse/template.py +2 -2
- deepeval/metrics/multimodal_metrics/__init__.py +0 -18
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +24 -17
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +26 -21
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +24 -17
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +24 -17
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +19 -19
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +63 -78
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +20 -20
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +71 -50
- deepeval/metrics/non_advice/template.py +2 -2
- deepeval/metrics/pii_leakage/template.py +2 -2
- deepeval/metrics/prompt_alignment/template.py +4 -4
- deepeval/metrics/ragas.py +3 -3
- deepeval/metrics/role_violation/template.py +2 -2
- deepeval/metrics/step_efficiency/step_efficiency.py +1 -1
- deepeval/metrics/tool_correctness/tool_correctness.py +2 -2
- deepeval/metrics/toxicity/template.py +4 -4
- deepeval/metrics/turn_contextual_precision/schema.py +21 -0
- deepeval/metrics/turn_contextual_precision/template.py +187 -0
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +550 -0
- deepeval/metrics/turn_contextual_recall/schema.py +21 -0
- deepeval/metrics/turn_contextual_recall/template.py +178 -0
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +520 -0
- deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
- deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +535 -0
- deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
- deepeval/metrics/turn_faithfulness/template.py +218 -0
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +596 -0
- deepeval/metrics/turn_relevancy/template.py +2 -2
- deepeval/metrics/utils.py +39 -58
- deepeval/models/__init__.py +0 -12
- deepeval/models/base_model.py +16 -38
- deepeval/models/embedding_models/__init__.py +7 -0
- deepeval/models/embedding_models/azure_embedding_model.py +69 -32
- deepeval/models/embedding_models/local_embedding_model.py +39 -22
- deepeval/models/embedding_models/ollama_embedding_model.py +42 -18
- deepeval/models/embedding_models/openai_embedding_model.py +50 -15
- deepeval/models/llms/amazon_bedrock_model.py +1 -2
- deepeval/models/llms/anthropic_model.py +53 -20
- deepeval/models/llms/azure_model.py +140 -43
- deepeval/models/llms/deepseek_model.py +38 -23
- deepeval/models/llms/gemini_model.py +222 -103
- deepeval/models/llms/grok_model.py +39 -27
- deepeval/models/llms/kimi_model.py +39 -23
- deepeval/models/llms/litellm_model.py +103 -45
- deepeval/models/llms/local_model.py +35 -22
- deepeval/models/llms/ollama_model.py +129 -17
- deepeval/models/llms/openai_model.py +151 -50
- deepeval/models/llms/portkey_model.py +149 -0
- deepeval/models/llms/utils.py +5 -3
- deepeval/models/retry_policy.py +17 -14
- deepeval/models/utils.py +94 -4
- deepeval/optimizer/__init__.py +5 -0
- deepeval/optimizer/algorithms/__init__.py +6 -0
- deepeval/optimizer/algorithms/base.py +29 -0
- deepeval/optimizer/algorithms/configs.py +18 -0
- deepeval/optimizer/algorithms/copro/__init__.py +5 -0
- deepeval/optimizer/algorithms/copro/copro.py +836 -0
- deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
- deepeval/optimizer/algorithms/gepa/gepa.py +737 -0
- deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
- deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
- deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
- deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
- deepeval/optimizer/algorithms/simba/__init__.py +5 -0
- deepeval/optimizer/algorithms/simba/simba.py +999 -0
- deepeval/optimizer/algorithms/simba/types.py +15 -0
- deepeval/optimizer/configs.py +31 -0
- deepeval/optimizer/policies.py +227 -0
- deepeval/optimizer/prompt_optimizer.py +263 -0
- deepeval/optimizer/rewriter/__init__.py +5 -0
- deepeval/optimizer/rewriter/rewriter.py +124 -0
- deepeval/optimizer/rewriter/utils.py +214 -0
- deepeval/optimizer/scorer/__init__.py +5 -0
- deepeval/optimizer/scorer/base.py +86 -0
- deepeval/optimizer/scorer/scorer.py +316 -0
- deepeval/optimizer/scorer/utils.py +30 -0
- deepeval/optimizer/types.py +148 -0
- deepeval/optimizer/utils.py +480 -0
- deepeval/prompt/prompt.py +7 -6
- deepeval/test_case/__init__.py +1 -3
- deepeval/test_case/api.py +12 -10
- deepeval/test_case/conversational_test_case.py +19 -1
- deepeval/test_case/llm_test_case.py +152 -1
- deepeval/test_case/utils.py +4 -8
- deepeval/test_run/api.py +15 -14
- deepeval/test_run/cache.py +2 -0
- deepeval/test_run/test_run.py +9 -4
- deepeval/tracing/patchers.py +9 -4
- deepeval/tracing/tracing.py +2 -2
- deepeval/utils.py +89 -0
- {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/METADATA +1 -4
- {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/RECORD +134 -118
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
- deepeval/models/mlllms/__init__.py +0 -4
- deepeval/models/mlllms/azure_model.py +0 -334
- deepeval/models/mlllms/gemini_model.py +0 -284
- deepeval/models/mlllms/ollama_model.py +0 -144
- deepeval/models/mlllms/openai_model.py +0 -258
- deepeval/test_case/mllm_test_case.py +0 -170
- /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
- {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/WHEEL +0 -0
- {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/entry_points.txt +0 -0
deepeval/metrics/answer_relevancy/template.py CHANGED

@@ -1,110 +1,206 @@
 from typing import List
+import textwrap


 class AnswerRelevancyTemplate:
     @staticmethod
-    def generate_statements(actual_output: str):
-    [old lines 7-32 not shown in the source view]
+    def generate_statements(actual_output: str, multimodal: bool = False):
+        multimodal_instruction = ""
+        example_text = ""
+        example_json = ""
+
+        if multimodal:
+            multimodal_instruction = " The text may contain images as well."
+            example_text = "Shoes. The shoes can be refunded at no extra cost. Thanks for asking the question!"
+            example_json = textwrap.dedent(
+                """
+                {{
+                    "statements": ["Shoes.", "Shoes can be refunded at no extra cost", "Thanks for asking the question!"]
+                }}
+                """
+            )
+        else:
+            example_text = "Our new laptop model features a high-resolution Retina display for crystal-clear visuals. It also includes a fast-charging battery, giving you up to 12 hours of usage on a single charge. For security, we've added fingerprint authentication and an encrypted SSD. Plus, every purchase comes with a one-year warranty and 24/7 customer support."
+            example_json = textwrap.dedent(
+                """
+                {{
+                    "statements": [
+                        "The new laptop model has a high-resolution Retina display.",
+                        "It includes a fast-charging battery with up to 12 hours of usage.",
+                        "Security features include fingerprint authentication and an encrypted SSD.",
+                        "Every purchase comes with a one-year warranty.",
+                        "24/7 customer support is included."
+                    ]
+                }}
+                """
+            )
+
+        coherence_note = (
+            ""
+            if multimodal
+            else " Ambiguous statements and single words can be considered as statements, but only if outside of a coherent statement."
+        )
+
+        return textwrap.dedent(
+            f"""Given the text, breakdown and generate a list of statements presented.{coherence_note}{multimodal_instruction}
+
+            Example:
+            Example text:
+            {example_text}
+
+            {example_json}
+            ===== END OF EXAMPLE ======
+
+            **
+            IMPORTANT: Please make sure to only return in valid and parseable JSON format, with the "statements" key mapping to a list of strings. No words or explanation are needed. Ensure all strings are closed appropriately. Repair any invalid JSON before you output it.
+            **
+
+            Text:
+            {actual_output}
+
+            JSON:
+            """
+        )

     @staticmethod
-    def generate_verdicts(
-    [old lines 36-59 not shown in the source view]
-}}
-    [old lines 61-76 not shown in the source view]
-"""
+    def generate_verdicts(
+        input: str, statements: str, multimodal: bool = False
+    ):
+        content_type = (
+            "statements (which can contain images)"
+            if multimodal
+            else "list of statements"
+        )
+        statement_or_image = "statement or image" if multimodal else "statement"
+
+        format_instruction = textwrap.dedent(
+            """
+            Expected JSON format:
+            {{
+                "verdicts": [
+                    {{
+                        "verdict": "yes"
+                    }},
+                    {{
+                        "reason": <explanation_for_irrelevance>,
+                        "verdict": "no"
+                    }},
+                    {{
+                        "reason": <explanation_for_ambiguity>,
+                        "verdict": "idk"
+                    }}
+                ]
+            }}
+            """
+        )
+
+        example_section = ""
+        if multimodal:
+            example_section = textwrap.dedent(
+                """
+                Example input: What should I do if there is an earthquake?
+                Example statements: ["Shoes.", "Thanks for asking the question!", "Is there anything else I can help you with?", "Duck and hide"]
+                Example JSON:
+                {{
+                    "verdicts": [
+                        {{
+                            "reason": "The 'Shoes.' statement made in the actual output is completely irrelevant to the input, which asks about what to do in the event of an earthquake.",
+                            "verdict": "no"
+                        }},
+                        {{
+                            "reason": "The statement thanking the user for asking the question is not directly relevant to the input, but is not entirely irrelevant.",
+                            "verdict": "idk"
+                        }},
+                        {{
+                            "reason": "The question about whether there is anything else the user can help with is not directly relevant to the input, but is not entirely irrelevant.",
+                            "verdict": "idk"
+                        }},
+                        {{
+                            "verdict": "yes"
+                        }}
+                    ]
+                }}
+                """
+            )
+
+        guidelines = ""
+        if multimodal:
+            guidelines = textwrap.dedent(
+                f"""
+                Since you are going to generate a verdict for each statement and image, the number of 'verdicts' SHOULD BE STRICTLY EQUAL to the number of `statements`.
+                """
+            )
+        else:
+            guidelines = textwrap.dedent(
+                f"""
+                Generate ONE verdict per statement - number of 'verdicts' MUST equal number of statements.
+                'verdict' must be STRICTLY 'yes', 'no', or 'idk':
+                - 'yes': statement is relevant to addressing the input
+                - 'no': statement is irrelevant to the input
+                - 'idk': statement is ambiguous (not directly relevant but could be supporting information)
+                Provide 'reason' ONLY for 'no' or 'idk' verdicts.
+                """
+            )
+
+        return textwrap.dedent(
+            f"""For the provided {content_type}, determine whether each {statement_or_image} is relevant to address the input.
+            {"Please generate a list of JSON with two keys: `verdict` and `reason`." if multimodal else "Generate JSON objects with 'verdict' and 'reason' fields."}
+            The 'verdict' {"key " if multimodal else ''}should {"STRICTLY be either a 'yes', 'idk' or 'no'" if multimodal else "be 'yes' (relevant), 'no' (irrelevant), or 'idk' (ambiguous/supporting information)"}. {"Answer 'yes' if the " + statement_or_image + ' is relevant to addressing the original input, no if the ' + statement_or_image + ' is irrelevant, and "idk" if it is ambiguous (eg., not directly relevant but could be used as a supporting point to address the input).' if multimodal else ""}
+            {"The 'reason' is the reason for the verdict." if multimodal else ""}
+            Provide 'reason' ONLY for 'no' or 'idk' verdicts.
+            The {"provided statements are statements and images" if multimodal else "statements are from an AI's actual output"} generated in the actual output.
+
+            **
+            IMPORTANT: Please make sure to only return in valid and parseable JSON format, with the 'verdicts' key mapping to a list of JSON objects. Ensure all strings are closed appropriately. Repair any invalid JSON before you output it.
+
+            {format_instruction if not multimodal else ''}
+            {example_section}
+            {guidelines}
+            **
+
+            Input:
+            {input}
+
+            Statements:
+            {statements}
+
+            JSON:
+            """
+        )

     @staticmethod
     def generate_reason(
-        irrelevant_statements: List[str],
+        irrelevant_statements: List[str],
+        input: str,
+        score: float,
+        multimodal: bool = False,
     ):
-        return
-    [old lines 84-86 not shown in the source view]
+        return textwrap.dedent(
+            f"""Given the answer relevancy score, the list of reasons of irrelevant statements made in the actual output, and the input, provide a CONCISE reason for the score. Explain why it is not higher, but also why it is at its current score.
+            The irrelevant statements represent things in the actual output that is irrelevant to addressing whatever is asked/talked about in the input.
+            If there is nothing irrelevant, just say something positive with an upbeat encouraging tone (but don't overdo it otherwise it gets annoying).

-**
-IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason. Ensure all strings are closed appropriately. Repair any invalid JSON before you output it.

-    [old lines 91-92 not shown in the source view]
-{{
-    "reason": "The score is <answer_relevancy_score> because <your_reason>."
-}}
-===== END OF EXAMPLE ======
-**
+            **
+            IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason. Ensure all strings are closed appropriately. Repair any invalid JSON before you output it.

+            {"Example:" if not multimodal else ""}
+            Example JSON:
+            {{
+                "reason": "The score is <answer_relevancy_score> because <your_reason>."
+            }}
+            {"===== END OF EXAMPLE ======" if not multimodal else ""}
+            **

-Answer Relevancy Score:
-{score}
+            Answer Relevancy Score:
+            {score}

-Reasons why the score can't be higher based on irrelevant statements in the actual output:
-{irrelevant_statements}
+            Reasons why the score can't be higher based on irrelevant statements in the actual output:
+            {irrelevant_statements}

-Input:
-{input}
+            Input:
+            {input}

-JSON:
-"""
+            JSON:
+            """
+        )
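
The hunk above folds what used to be a separate multimodal template into AnswerRelevancyTemplate behind a single multimodal flag. A minimal sketch of exercising the new signatures (assumes deepeval 3.7.5 is installed; all input strings are illustrative, not from the package):

from deepeval.metrics.answer_relevancy.template import AnswerRelevancyTemplate

# Prompt asking the judge model to split an output into discrete statements;
# multimodal=True swaps in the image-aware few-shot example instead.
statements_prompt = AnswerRelevancyTemplate.generate_statements(
    actual_output="Shoes can be refunded at no extra cost.",  # illustrative
    multimodal=False,
)

# Prompt asking for one relevancy verdict per extracted statement.
verdicts_prompt = AnswerRelevancyTemplate.generate_verdicts(
    input="What is your refund policy?",  # illustrative
    statements='["Shoes can be refunded at no extra cost."]',
    multimodal=False,
)

print(statements_prompt)
print(verdicts_prompt)

Both methods only build prompt strings; nothing is sent to a model until a metric such as AnswerRelevancyMetric forwards them to the configured judge.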
deepeval/metrics/argument_correctness/template.py CHANGED

@@ -70,8 +70,8 @@ class ArgumentCorrectnessTemplate:
                 "verdict": "yes"
             }},
             {{
-                "…
-                "…
+                "reason": "Recommending romantic Parisian comedies does not help find the highest temperature in 2023.",
+                "verdict": "no"
             }}
         ]
     }}
deepeval/metrics/base_metric.py CHANGED

@@ -4,7 +4,6 @@ from typing import Optional, Dict, List
 from deepeval.test_case import (
     LLMTestCase,
     ConversationalTestCase,
-    MLLMTestCase,
     LLMTestCaseParams,
     ArenaTestCase,
 )

@@ -113,13 +112,11 @@ class BaseMultimodalMetric:
         self._threshold = value

     @abstractmethod
-    def measure(self, test_case: MLLMTestCase, *args, **kwargs) -> float:
+    def measure(self, test_case: LLMTestCase, *args, **kwargs) -> float:
         raise NotImplementedError

     @abstractmethod
-    async def a_measure(
-        self, test_case: MLLMTestCase, *args, **kwargs
-    ) -> float:
+    async def a_measure(self, test_case: LLMTestCase, *args, **kwargs) -> float:
         raise NotImplementedError(
             f"Async execution for {self.__class__.__name__} not supported yet. Please set 'async_mode' to 'False'."
         )
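
With MLLMTestCase removed from the imports, BaseMultimodalMetric now types both abstract hooks against LLMTestCase. A sketch of a conforming subclass under the new signatures (the subclass name and scoring logic are hypothetical, and any further abstract members the base class declares are omitted):

from deepeval.metrics.base_metric import BaseMultimodalMetric
from deepeval.test_case import LLMTestCase


class ToyImageMetric(BaseMultimodalMetric):  # hypothetical example subclass
    # Both hooks now accept LLMTestCase instead of the removed MLLMTestCase.
    def measure(self, test_case: LLMTestCase, *args, **kwargs) -> float:
        self.score = 1.0  # placeholder scoring logic, not a real metric
        return self.score

    async def a_measure(self, test_case: LLMTestCase, *args, **kwargs) -> float:
        return self.measure(test_case, *args, **kwargs)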

deepeval/metrics/bias/template.py CHANGED

@@ -64,15 +64,15 @@ Example JSON:
 {{
     "verdicts": [
         {{
-            "…
-            "…
+            "reason": "The opinion "Government meddling in healthcare bloats costs and quashes innovation" reveals a political bias, emphasizing negative views on government involvement.",
+            "verdict": "yes"
         }},
         {{
             "verdict": "no"
         }},
         {{
             "verdict": "no"
-        }}
+        }}
     ]
 }}

deepeval/metrics/contextual_precision/contextual_precision.py CHANGED

@@ -1,10 +1,14 @@
 from typing import Optional, List, Type, Union

-from deepeval.utils import get_or_create_event_loop, prettify_list
+from deepeval.utils import (
+    get_or_create_event_loop,
+    prettify_list,
+)
 from deepeval.metrics.utils import (
     construct_verbose_logs,
     trimAndLoadJson,
     check_llm_test_case_params,
+    check_mllm_test_case_params,
     initialize_model,
 )
 from deepeval.test_case import (

@@ -56,7 +60,15 @@ class ContextualPrecisionMetric(BaseMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
-        check_llm_test_case_params(test_case, self._required_params, self)
+
+        multimodal = test_case.multimodal
+
+        if multimodal:
+            check_mllm_test_case_params(
+                test_case, self._required_params, None, None, self, self.model
+            )
+        else:
+            check_llm_test_case_params(test_case, self._required_params, self)

         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(

@@ -73,15 +85,20 @@ class ContextualPrecisionMetric(BaseMetric):
                     )
                 )
             else:
+                input = test_case.input
+                expected_output = test_case.expected_output
+                retrieval_context = test_case.retrieval_context
+
                 self.verdicts: List[cpschema.ContextualPrecisionVerdict] = (
                     self._generate_verdicts(
-                        test_case.input,
-                        test_case.expected_output,
-                        test_case.retrieval_context,
+                        input,
+                        expected_output,
+                        retrieval_context,
+                        multimodal,
                     )
                 )
                 self.score = self._calculate_score()
-                self.reason = self._generate_reason(test_case.input)
+                self.reason = self._generate_reason(input, multimodal)
                 self.success = self.score >= self.threshold
                 self.verbose_logs = construct_verbose_logs(
                     self,

@@ -104,7 +121,14 @@ class ContextualPrecisionMetric(BaseMetric):
         _log_metric_to_confident: bool = True,
     ) -> float:

-        check_llm_test_case_params(test_case, self._required_params, self)
+        multimodal = test_case.multimodal
+
+        if multimodal:
+            check_mllm_test_case_params(
+                test_case, self._required_params, None, None, self, self.model
+            )
+        else:
+            check_llm_test_case_params(test_case, self._required_params, self)

         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(

@@ -113,15 +137,17 @@ class ContextualPrecisionMetric(BaseMetric):
             _show_indicator=_show_indicator,
             _in_component=_in_component,
         ):
+            input = test_case.input
+            expected_output = test_case.expected_output
+            retrieval_context = test_case.retrieval_context
+
             self.verdicts: List[cpschema.ContextualPrecisionVerdict] = (
                 await self._a_generate_verdicts(
-                    test_case.input,
-                    test_case.expected_output,
-                    test_case.retrieval_context,
+                    input, expected_output, retrieval_context, multimodal
                 )
             )
             self.score = self._calculate_score()
-            self.reason = await self._a_generate_reason(test_case.input)
+            self.reason = await self._a_generate_reason(input, multimodal)
             self.success = self.score >= self.threshold
             self.verbose_logs = construct_verbose_logs(
                 self,

@@ -136,7 +162,7 @@ class ContextualPrecisionMetric(BaseMetric):
         )
         return self.score

-    async def _a_generate_reason(self, input: str):
+    async def _a_generate_reason(self, input: str, multimodal: bool):
         if self.include_reason is False:
             return None

@@ -148,6 +174,7 @@ class ContextualPrecisionMetric(BaseMetric):
             input=input,
             verdicts=retrieval_contexts_verdicts,
             score=format(self.score, ".2f"),
+            multimodal=multimodal,
         )

         if self.using_native_model:

@@ -169,7 +196,7 @@ class ContextualPrecisionMetric(BaseMetric):
             data = trimAndLoadJson(res, self)
             return data["reason"]

-    def _generate_reason(self, input: str):
+    def _generate_reason(self, input: str, multimodal: bool):
         if self.include_reason is False:
             return None

@@ -181,6 +208,7 @@ class ContextualPrecisionMetric(BaseMetric):
             input=input,
             verdicts=retrieval_contexts_verdicts,
             score=format(self.score, ".2f"),
+            multimodal=multimodal,
         )

         if self.using_native_model:

@@ -203,12 +231,17 @@ class ContextualPrecisionMetric(BaseMetric):
         return data["reason"]

     async def _a_generate_verdicts(
-        self,
+        self,
+        input: str,
+        expected_output: str,
+        retrieval_context: List[str],
+        multimodal: bool,
     ) -> List[cpschema.ContextualPrecisionVerdict]:
         prompt = self.evaluation_template.generate_verdicts(
             input=input,
             expected_output=expected_output,
             retrieval_context=retrieval_context,
+            multimodal=multimodal,
         )
         if self.using_native_model:
             res, cost = await self.model.a_generate(

@@ -234,12 +267,17 @@ class ContextualPrecisionMetric(BaseMetric):
         return verdicts

     def _generate_verdicts(
-        self,
+        self,
+        input: str,
+        expected_output: str,
+        retrieval_context: List[str],
+        multimodal: bool,
     ) -> List[cpschema.ContextualPrecisionVerdict]:
         prompt = self.evaluation_template.generate_verdicts(
             input=input,
             expected_output=expected_output,
             retrieval_context=retrieval_context,
+            multimodal=multimodal,
         )
         if self.using_native_model:
             res, cost = self.model.generate(prompt, schema=cpschema.Verdicts)