deepeval 3.7.4__py3-none-any.whl → 3.7.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/dataset/golden.py +54 -2
- deepeval/evaluate/evaluate.py +16 -8
- deepeval/evaluate/execute.py +70 -26
- deepeval/evaluate/utils.py +26 -22
- deepeval/integrations/pydantic_ai/agent.py +19 -2
- deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
- deepeval/metrics/__init__.py +14 -12
- deepeval/metrics/answer_relevancy/answer_relevancy.py +74 -29
- deepeval/metrics/answer_relevancy/template.py +188 -92
- deepeval/metrics/base_metric.py +2 -5
- deepeval/metrics/contextual_precision/contextual_precision.py +53 -15
- deepeval/metrics/contextual_precision/template.py +115 -66
- deepeval/metrics/contextual_recall/contextual_recall.py +50 -13
- deepeval/metrics/contextual_recall/template.py +106 -55
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +47 -15
- deepeval/metrics/contextual_relevancy/template.py +87 -58
- deepeval/metrics/dag/templates.py +2 -2
- deepeval/metrics/faithfulness/faithfulness.py +70 -27
- deepeval/metrics/faithfulness/schema.py +1 -1
- deepeval/metrics/faithfulness/template.py +200 -115
- deepeval/metrics/g_eval/utils.py +2 -2
- deepeval/metrics/indicator.py +4 -4
- deepeval/metrics/multimodal_metrics/__init__.py +0 -18
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +24 -17
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +26 -21
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +24 -17
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +24 -17
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +19 -19
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +63 -78
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +20 -20
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +71 -50
- deepeval/metrics/ragas.py +3 -3
- deepeval/metrics/tool_correctness/tool_correctness.py +2 -2
- deepeval/metrics/turn_contextual_precision/schema.py +21 -0
- deepeval/metrics/turn_contextual_precision/template.py +187 -0
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +550 -0
- deepeval/metrics/turn_contextual_recall/schema.py +21 -0
- deepeval/metrics/turn_contextual_recall/template.py +178 -0
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +520 -0
- deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
- deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +535 -0
- deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
- deepeval/metrics/turn_faithfulness/template.py +218 -0
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +596 -0
- deepeval/metrics/utils.py +39 -58
- deepeval/models/__init__.py +0 -12
- deepeval/models/base_model.py +16 -38
- deepeval/models/embedding_models/__init__.py +7 -0
- deepeval/models/embedding_models/azure_embedding_model.py +52 -28
- deepeval/models/embedding_models/local_embedding_model.py +18 -14
- deepeval/models/embedding_models/ollama_embedding_model.py +38 -16
- deepeval/models/embedding_models/openai_embedding_model.py +40 -21
- deepeval/models/llms/amazon_bedrock_model.py +1 -2
- deepeval/models/llms/anthropic_model.py +44 -23
- deepeval/models/llms/azure_model.py +121 -36
- deepeval/models/llms/deepseek_model.py +18 -13
- deepeval/models/llms/gemini_model.py +129 -43
- deepeval/models/llms/grok_model.py +18 -13
- deepeval/models/llms/kimi_model.py +18 -13
- deepeval/models/llms/litellm_model.py +42 -22
- deepeval/models/llms/local_model.py +12 -7
- deepeval/models/llms/ollama_model.py +114 -12
- deepeval/models/llms/openai_model.py +137 -41
- deepeval/models/llms/portkey_model.py +24 -7
- deepeval/models/llms/utils.py +5 -3
- deepeval/models/retry_policy.py +17 -14
- deepeval/models/utils.py +46 -1
- deepeval/optimizer/__init__.py +5 -0
- deepeval/optimizer/algorithms/__init__.py +6 -0
- deepeval/optimizer/algorithms/base.py +29 -0
- deepeval/optimizer/algorithms/configs.py +18 -0
- deepeval/optimizer/algorithms/copro/__init__.py +5 -0
- deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
- deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
- deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
- deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
- deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
- deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
- deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
- deepeval/optimizer/algorithms/simba/__init__.py +5 -0
- deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
- deepeval/{optimization → optimizer}/configs.py +5 -8
- deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
- deepeval/optimizer/prompt_optimizer.py +263 -0
- deepeval/optimizer/rewriter/__init__.py +5 -0
- deepeval/optimizer/rewriter/rewriter.py +124 -0
- deepeval/optimizer/rewriter/utils.py +214 -0
- deepeval/optimizer/scorer/__init__.py +5 -0
- deepeval/optimizer/scorer/base.py +86 -0
- deepeval/optimizer/scorer/scorer.py +316 -0
- deepeval/optimizer/scorer/utils.py +30 -0
- deepeval/optimizer/types.py +148 -0
- deepeval/{optimization → optimizer}/utils.py +47 -165
- deepeval/prompt/prompt.py +5 -9
- deepeval/test_case/__init__.py +1 -3
- deepeval/test_case/api.py +12 -10
- deepeval/test_case/conversational_test_case.py +19 -1
- deepeval/test_case/llm_test_case.py +152 -1
- deepeval/test_case/utils.py +4 -8
- deepeval/test_run/api.py +15 -14
- deepeval/test_run/test_run.py +3 -3
- deepeval/tracing/patchers.py +9 -4
- deepeval/tracing/tracing.py +2 -2
- deepeval/utils.py +65 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/METADATA +1 -4
- {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/RECORD +116 -125
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
- deepeval/models/mlllms/__init__.py +0 -4
- deepeval/models/mlllms/azure_model.py +0 -343
- deepeval/models/mlllms/gemini_model.py +0 -313
- deepeval/models/mlllms/ollama_model.py +0 -175
- deepeval/models/mlllms/openai_model.py +0 -309
- deepeval/optimization/__init__.py +0 -13
- deepeval/optimization/adapters/__init__.py +0 -2
- deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
- deepeval/optimization/aggregates.py +0 -14
- deepeval/optimization/copro/configs.py +0 -31
- deepeval/optimization/gepa/__init__.py +0 -7
- deepeval/optimization/gepa/configs.py +0 -115
- deepeval/optimization/miprov2/configs.py +0 -134
- deepeval/optimization/miprov2/loop.py +0 -785
- deepeval/optimization/mutations/__init__.py +0 -0
- deepeval/optimization/mutations/prompt_rewriter.py +0 -458
- deepeval/optimization/policies/__init__.py +0 -16
- deepeval/optimization/policies/tie_breaker.py +0 -67
- deepeval/optimization/prompt_optimizer.py +0 -462
- deepeval/optimization/simba/__init__.py +0 -0
- deepeval/optimization/simba/configs.py +0 -33
- deepeval/optimization/types.py +0 -361
- deepeval/test_case/mllm_test_case.py +0 -170
- /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
- /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/WHEEL +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,218 @@
|
|
|
1
|
+
from typing import Optional, List
|
|
2
|
+
import textwrap
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class TurnFaithfulnessTemplate:
    """Prompt templates for the TurnFaithfulness metric.

    Each static method returns a fully formatted LLM prompt string for one
    stage of the per-turn faithfulness pipeline:

    1. ``generate_claims``       – extract factual claims from the assistant's output
    2. ``generate_truths``       – extract atomic truths from the reference context
    3. ``generate_verdicts``     – compare each claim against the truths
    4. ``generate_reason``       – justify a single turn's score from its contradictions
    5. ``generate_final_reason`` – synthesize one explanation across all turns

    All prompts instruct the model to emit JSON only (except the final reason,
    which is a single free-text paragraph).
    """

    # Extra rules appended to prompts when the test case contains image
    # content; substituted with an empty string in the text-only case.
    multimodal_rules = """
--- MULTIMODAL INPUT RULES ---
- Treat image content as factual evidence.
- Only reference visual details that are explicitly and clearly visible.
- Do not infer or guess objects, text, or details not visibly present.
- If an image is unclear or ambiguous, mark uncertainty explicitly.
- When evaluating claims, compare them to BOTH textual and visual evidence.
- If the claim references something not clearly visible, respond with 'idk'.
"""

    @staticmethod
    def generate_claims(
        input: str, assistant_output: str, multimodal: bool = False
    ) -> str:
        """Build the prompt that extracts factual claims from the assistant's output.

        Args:
            input: The user's preceding message (used only for reference resolution).
            assistant_output: The assistant turn to mine for claims.
            multimodal: When True, append the multimodal evidence rules.

        Returns:
            A prompt string asking for ``{"claims": [...]}`` JSON output.
        """
        # Prompt body is kept flush-left so textwrap.dedent is a no-op and
        # the interpolated multimodal_rules block lines up with the rest.
        return textwrap.dedent(
            f"""
Extract every factual-sounding claim asserted in the ASSISTANT'S OUTPUT.

A claim is any statement presented as fact, even if it is incorrect, vague, implied, or unverifiable.

RULES:
- Use ONLY the assistant's output as the source of claims.
- Use the user's preceding message ONLY to resolve pronouns or references, not as factual evidence.
- Extract claims exactly as stated without rewriting, summarizing, merging, or omitting details.
- If a sentence contains multiple factual assertions, extract each as a separate claim.
- Claims may involve text or images if multimodal.
- Do NOT add, infer, or transform information.

{TurnFaithfulnessTemplate.multimodal_rules if multimodal else ""}

Output MUST be ONLY valid JSON:

{{
    "claims": ["claim 1", "claim 2", ...]
}}

USER MESSAGE:
{input}

ASSISTANT OUTPUT:
{assistant_output}

JSON:
"""
        )

    @staticmethod
    def generate_truths(
        reference_context: str,
        extraction_limit: Optional[int],
        multimodal: bool = False,
    ) -> str:
        """Build the prompt that extracts atomic truths from the reference context.

        Args:
            reference_context: The retrieval/reference text to extract truths from.
            extraction_limit: Maximum number of truths to request; ``None`` means
                unlimited, ``1`` uses singular phrasing.
            multimodal: When True, append the multimodal evidence rules.

        Returns:
            A prompt string asking for ``{"truths": [...]}`` JSON output.
        """
        # Phrase the extraction budget grammatically: unlimited / singular / plural.
        if extraction_limit is None:
            limit_description = "factual, explicit truths"
        elif extraction_limit == 1:
            limit_description = "one factual, explicit truth"
        else:
            limit_description = f"{extraction_limit} factual, explicit truths"

        return textwrap.dedent(
            f"""
Extract {limit_description} from the REFERENCE CONTEXT.

RULES:
- Truths must be atomic, explicit factual statements.
- Do not summarize or combine multiple facts.
- Select truths based on reading order, not 'importance'.
- Do not infer or expand beyond what is explicitly stated.
- Keep each truth minimal but complete.
- Treat images as factual evidence if multimodal, using only clearly visible information.

{TurnFaithfulnessTemplate.multimodal_rules if multimodal else ""}

Output MUST be ONLY valid JSON:

{{
    "truths": ["truth 1", "truth 2", ...]
}}

REFERENCE CONTEXT:
{reference_context}

JSON:
"""
        )

    @staticmethod
    def generate_verdicts(
        claims: List[str], reference_context: str, multimodal: bool = False
    ) -> str:
        """Build the prompt that judges each claim against the reference context.

        Args:
            claims: Claims previously extracted from the assistant's output.
            reference_context: The truths/context to judge the claims against.
            multimodal: When True, append the multimodal evidence rules.

        Returns:
            A prompt string asking for ``{"verdicts": [...]}`` JSON output, one
            verdict ("yes" / "no" / "idk") per claim, in order.
        """
        return textwrap.dedent(
            f"""
For each claim, determine whether it is supported, contradicted, or not addressed by the reference context.

DEFINITIONS:
- "yes" = The claim is directly supported by at least one truth.
- "no" = The claim directly contradicts at least one truth.
- "idk" = The context does not confirm or contradict the claim.

RULES:
- One verdict per claim, in the same order.
- Do NOT use prior knowledge.
- Only use the explicit truths provided.
- A "yes" verdict must not include a reason.
- A "no" or "idk" verdict must include a concise reason that quotes or paraphrases only the truths.
- If a claim references an image and the visibility is unclear or ambiguous, use "idk".
- Do not create new facts or explanations.

{TurnFaithfulnessTemplate.multimodal_rules if multimodal else ""}

Output MUST be ONLY valid JSON:

{{
    "verdicts": [
        {{
            "verdict": "yes"
        }},
        {{
            "verdict": "no",
            "reason": "<explanation>"
        }},
        {{
            "verdict": "idk",
            "reason": "<explanation>"
        }}
    ]
}}

REFERENCE CONTEXT:
{reference_context}

CLAIMS:
{claims}

JSON:
"""
        )

    @staticmethod
    def generate_reason(
        score: float, contradictions: List[str], multimodal: bool = False
    ) -> str:
        """Build the prompt that justifies a single turn's faithfulness score.

        Args:
            score: The computed faithfulness score for the turn.
            contradictions: Reasons extracted from "no"/"idk" verdicts.
            multimodal: When True, append the multimodal evidence rules.

        Returns:
            A prompt string asking for ``{"reason": "<summary>"}`` JSON output.
        """
        return textwrap.dedent(
            f"""
Below is a list of contradictions extracted from verdicts. Write a concise justification of the score.

RULES:
- If contradictions exist, summarize them in 1-3 sentences.
- If no contradictions exist, respond:
{{
    "reason": "No contradictions were found."
}}
- The summary must reference only the contradictions listed.
- Tone must be neutral and concise.
- No external knowledge may be used.

{TurnFaithfulnessTemplate.multimodal_rules if multimodal else ""}

Output MUST be ONLY valid JSON:

{{
    "reason": "<summary>"
}}

FAITHFULNESS SCORE:
{score}

CONTRADICTIONS:
{contradictions}

JSON:
"""
        )

    @staticmethod
    def generate_final_reason(
        final_score: float, success: bool, reasons: List[str]
    ) -> str:
        """Build the prompt that synthesizes one final explanation across all turns.

        Args:
            final_score: Averaged score across all interactions.
            success: Whether the metric passed its threshold.
            reasons: Per-turn reasons generated from individual verdicts.

        Returns:
            A prompt string asking for a single free-text paragraph (not JSON).
        """
        # NOTE: the instruction list was previously numbered 1, 2, 3, 5, 6
        # (step 4 missing); renumbered to a consistent 1-5.
        return textwrap.dedent(
            f"""You are an AI evaluator producing a single final explanation for the TurnFaithfulnessMetric result.

Context:
This metric evaluates conversational faithfulness by extracting truths from retrieval context, extracting claims from the assistant's output, and generating verdicts that compare each claim against the truths. Each interaction yields a reason indicating why a verdict failed or succeeded. You are given all those reasons.

Inputs:
- final_score: the averaged score across all interactions.
- success: whether the metric passed or failed
- reasons: a list of textual reasons generated from individual verdicts.

Instructions:
1. Read all reasons and synthesize them into one unified explanation.
2. Describe patterns of claim-truth mismatches, contradictions, hallucinations, unsupported statements, or image-related errors if present.
3. Do not repeat every reason; merge them into a concise, coherent narrative.
4. If the metric failed, state the dominant failure modes. If it passed, state why the model's claims aligned with truths.
5. Output a single paragraph with no lists, no bullets, no markup.

Output:
A single paragraph explaining the final outcome.

Here's the inputs:

Final Score: {final_score}

Reasons:
{reasons}

Success: {success}

Now give me a final reason that explains why the metric passed or failed. Output ONLY the reason and nothing else.

The final reason:
"""
        )