deepeval 3.7.4__py3-none-any.whl → 3.7.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (155)
  1. deepeval/_version.py +1 -1
  2. deepeval/dataset/golden.py +54 -2
  3. deepeval/evaluate/evaluate.py +16 -8
  4. deepeval/evaluate/execute.py +70 -26
  5. deepeval/evaluate/utils.py +26 -22
  6. deepeval/integrations/pydantic_ai/agent.py +19 -2
  7. deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
  8. deepeval/metrics/__init__.py +14 -12
  9. deepeval/metrics/answer_relevancy/answer_relevancy.py +74 -29
  10. deepeval/metrics/answer_relevancy/template.py +188 -92
  11. deepeval/metrics/base_metric.py +2 -5
  12. deepeval/metrics/contextual_precision/contextual_precision.py +53 -15
  13. deepeval/metrics/contextual_precision/template.py +115 -66
  14. deepeval/metrics/contextual_recall/contextual_recall.py +50 -13
  15. deepeval/metrics/contextual_recall/template.py +106 -55
  16. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +47 -15
  17. deepeval/metrics/contextual_relevancy/template.py +87 -58
  18. deepeval/metrics/dag/templates.py +2 -2
  19. deepeval/metrics/faithfulness/faithfulness.py +70 -27
  20. deepeval/metrics/faithfulness/schema.py +1 -1
  21. deepeval/metrics/faithfulness/template.py +200 -115
  22. deepeval/metrics/g_eval/utils.py +2 -2
  23. deepeval/metrics/indicator.py +4 -4
  24. deepeval/metrics/multimodal_metrics/__init__.py +0 -18
  25. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +24 -17
  26. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +26 -21
  27. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +24 -17
  28. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +24 -17
  29. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +19 -19
  30. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +63 -78
  31. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +20 -20
  32. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +71 -50
  33. deepeval/metrics/ragas.py +3 -3
  34. deepeval/metrics/tool_correctness/tool_correctness.py +2 -2
  35. deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  36. deepeval/metrics/turn_contextual_precision/template.py +187 -0
  37. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +550 -0
  38. deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  39. deepeval/metrics/turn_contextual_recall/template.py +178 -0
  40. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +520 -0
  41. deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
  42. deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
  43. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +535 -0
  44. deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
  45. deepeval/metrics/turn_faithfulness/template.py +218 -0
  46. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +596 -0
  47. deepeval/metrics/utils.py +39 -58
  48. deepeval/models/__init__.py +0 -12
  49. deepeval/models/base_model.py +16 -38
  50. deepeval/models/embedding_models/__init__.py +7 -0
  51. deepeval/models/embedding_models/azure_embedding_model.py +52 -28
  52. deepeval/models/embedding_models/local_embedding_model.py +18 -14
  53. deepeval/models/embedding_models/ollama_embedding_model.py +38 -16
  54. deepeval/models/embedding_models/openai_embedding_model.py +40 -21
  55. deepeval/models/llms/amazon_bedrock_model.py +1 -2
  56. deepeval/models/llms/anthropic_model.py +44 -23
  57. deepeval/models/llms/azure_model.py +121 -36
  58. deepeval/models/llms/deepseek_model.py +18 -13
  59. deepeval/models/llms/gemini_model.py +129 -43
  60. deepeval/models/llms/grok_model.py +18 -13
  61. deepeval/models/llms/kimi_model.py +18 -13
  62. deepeval/models/llms/litellm_model.py +42 -22
  63. deepeval/models/llms/local_model.py +12 -7
  64. deepeval/models/llms/ollama_model.py +114 -12
  65. deepeval/models/llms/openai_model.py +137 -41
  66. deepeval/models/llms/portkey_model.py +24 -7
  67. deepeval/models/llms/utils.py +5 -3
  68. deepeval/models/retry_policy.py +17 -14
  69. deepeval/models/utils.py +46 -1
  70. deepeval/optimizer/__init__.py +5 -0
  71. deepeval/optimizer/algorithms/__init__.py +6 -0
  72. deepeval/optimizer/algorithms/base.py +29 -0
  73. deepeval/optimizer/algorithms/configs.py +18 -0
  74. deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  75. deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
  76. deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  77. deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
  78. deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  79. deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  80. deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  81. deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  82. deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  83. deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
  84. deepeval/{optimization → optimizer}/configs.py +5 -8
  85. deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
  86. deepeval/optimizer/prompt_optimizer.py +263 -0
  87. deepeval/optimizer/rewriter/__init__.py +5 -0
  88. deepeval/optimizer/rewriter/rewriter.py +124 -0
  89. deepeval/optimizer/rewriter/utils.py +214 -0
  90. deepeval/optimizer/scorer/__init__.py +5 -0
  91. deepeval/optimizer/scorer/base.py +86 -0
  92. deepeval/optimizer/scorer/scorer.py +316 -0
  93. deepeval/optimizer/scorer/utils.py +30 -0
  94. deepeval/optimizer/types.py +148 -0
  95. deepeval/{optimization → optimizer}/utils.py +47 -165
  96. deepeval/prompt/prompt.py +5 -9
  97. deepeval/test_case/__init__.py +1 -3
  98. deepeval/test_case/api.py +12 -10
  99. deepeval/test_case/conversational_test_case.py +19 -1
  100. deepeval/test_case/llm_test_case.py +152 -1
  101. deepeval/test_case/utils.py +4 -8
  102. deepeval/test_run/api.py +15 -14
  103. deepeval/test_run/test_run.py +3 -3
  104. deepeval/tracing/patchers.py +9 -4
  105. deepeval/tracing/tracing.py +2 -2
  106. deepeval/utils.py +65 -0
  107. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/METADATA +1 -4
  108. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/RECORD +116 -125
  109. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
  110. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
  111. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
  112. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
  113. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
  114. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
  115. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
  116. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
  117. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
  118. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
  119. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
  120. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  121. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
  122. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
  123. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  124. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
  125. deepeval/models/mlllms/__init__.py +0 -4
  126. deepeval/models/mlllms/azure_model.py +0 -343
  127. deepeval/models/mlllms/gemini_model.py +0 -313
  128. deepeval/models/mlllms/ollama_model.py +0 -175
  129. deepeval/models/mlllms/openai_model.py +0 -309
  130. deepeval/optimization/__init__.py +0 -13
  131. deepeval/optimization/adapters/__init__.py +0 -2
  132. deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
  133. deepeval/optimization/aggregates.py +0 -14
  134. deepeval/optimization/copro/configs.py +0 -31
  135. deepeval/optimization/gepa/__init__.py +0 -7
  136. deepeval/optimization/gepa/configs.py +0 -115
  137. deepeval/optimization/miprov2/configs.py +0 -134
  138. deepeval/optimization/miprov2/loop.py +0 -785
  139. deepeval/optimization/mutations/__init__.py +0 -0
  140. deepeval/optimization/mutations/prompt_rewriter.py +0 -458
  141. deepeval/optimization/policies/__init__.py +0 -16
  142. deepeval/optimization/policies/tie_breaker.py +0 -67
  143. deepeval/optimization/prompt_optimizer.py +0 -462
  144. deepeval/optimization/simba/__init__.py +0 -0
  145. deepeval/optimization/simba/configs.py +0 -33
  146. deepeval/optimization/types.py +0 -361
  147. deepeval/test_case/mllm_test_case.py +0 -170
  148. /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
  149. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
  150. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
  151. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
  152. /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
  153. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/LICENSE.md +0 -0
  154. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/WHEEL +0 -0
  155. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,218 @@
1
+ from typing import Optional, List
2
+ import textwrap
3
+
4
+
5
class TurnFaithfulnessTemplate:
    """Prompt builders for the turn-level faithfulness evaluation.

    Each public static method returns one prompt string: claim extraction,
    truth extraction, per-claim verdicts, a per-turn reason, and a final
    reason that summarises all per-turn reasons.

    All prompts go through ``_render``, which dedents the static template
    *before* substituting runtime values. Substituting first (as an
    f-string passed to ``textwrap.dedent`` does) lets multi-line inputs or
    the multimodal rules block take part in the margin computation, which
    can leave the whole prompt indented.
    """

    # Extra rules inserted into a prompt when the caller passes
    # ``multimodal=True``. Kept indented here; ``_render`` dedents it on use.
    multimodal_rules = """
        --- MULTIMODAL INPUT RULES ---
        - Treat image content as factual evidence.
        - Only reference visual details that are explicitly and clearly visible.
        - Do not infer or guess objects, text, or details not visibly present.
        - If an image is unclear or ambiguous, mark uncertainty explicitly.
        - When evaluating claims, compare them to BOTH textual and visual evidence.
        - If the claim references something not clearly visible, respond with 'idk'.
    """

    @staticmethod
    def _render(template: str, multimodal: bool = False, **values) -> str:
        """Dedent ``template`` and then fill in its ``{placeholders}``.

        Args:
            template: A ``str.format`` template (literal braces doubled).
                It may contain a ``{multimodal_rules}`` placeholder.
            multimodal: When true, ``{multimodal_rules}`` is replaced by the
                dedented rules block; otherwise by an empty string.
            **values: Remaining placeholder values. They are inserted
                verbatim; braces inside a value are not re-interpreted.

        Returns:
            The rendered prompt.
        """
        rules = ""
        if multimodal:
            rules = textwrap.dedent(
                TurnFaithfulnessTemplate.multimodal_rules
            ).strip("\n")
        # Dedent first so that substituted text cannot change the margin.
        return textwrap.dedent(template).format(
            multimodal_rules=rules, **values
        )

    @staticmethod
    def generate_claims(
        input: str, assistant_output: str, multimodal: bool = False
    ) -> str:
        """Build the prompt that extracts claims from an assistant output.

        Args:
            input: The user message, shown under ``USER MESSAGE``.
            assistant_output: The text shown under ``ASSISTANT OUTPUT``.
            multimodal: Whether to include the multimodal rules block.

        Returns:
            A prompt requesting JSON of the form ``{"claims": [...]}``.
        """
        return TurnFaithfulnessTemplate._render(
            """
            Extract every factual-sounding claim asserted in the ASSISTANT'S OUTPUT.

            A claim is any statement presented as fact, even if it is incorrect, vague, implied, or unverifiable.

            RULES:
            - Use ONLY the assistant's output as the source of claims.
            - Use the user's preceding message ONLY to resolve pronouns or references, not as factual evidence.
            - Extract claims exactly as stated without rewriting, summarizing, merging, or omitting details.
            - If a sentence contains multiple factual assertions, extract each as a separate claim.
            - Claims may involve text or images if multimodal.
            - Do NOT add, infer, or transform information.

            {multimodal_rules}

            Output MUST be ONLY valid JSON:

            {{
                "claims": ["claim 1", "claim 2", ...]
            }}

            USER MESSAGE:
            {input}

            ASSISTANT OUTPUT:
            {assistant_output}

            JSON:
            """,
            multimodal=multimodal,
            input=input,
            assistant_output=assistant_output,
        )

    @staticmethod
    def generate_truths(
        reference_context: str,
        extraction_limit: Optional[int],
        multimodal: bool = False,
    ) -> str:
        """Build the prompt that extracts truths from the reference context.

        Args:
            reference_context: The text shown under ``REFERENCE CONTEXT``.
            extraction_limit: ``None`` asks for truths without a count,
                ``1`` asks for one truth, any other value asks for that
                many truths.
            multimodal: Whether to include the multimodal rules block.

        Returns:
            A prompt requesting JSON of the form ``{"truths": [...]}``.
        """
        if extraction_limit is None:
            limit_description = "factual, explicit truths"
        elif extraction_limit == 1:
            limit_description = "one factual, explicit truth"
        else:
            limit_description = f"{extraction_limit} factual, explicit truths"

        return TurnFaithfulnessTemplate._render(
            """
            Extract {limit_description} from the REFERENCE CONTEXT.

            RULES:
            - Truths must be atomic, explicit factual statements.
            - Do not summarize or combine multiple facts.
            - Select truths based on reading order, not 'importance'.
            - Do not infer or expand beyond what is explicitly stated.
            - Keep each truth minimal but complete.
            - Treat images as factual evidence if multimodal, using only clearly visible information.

            {multimodal_rules}

            Output MUST be ONLY valid JSON:

            {{
                "truths": ["truth 1", "truth 2", ...]
            }}

            REFERENCE CONTEXT:
            {reference_context}

            JSON:
            """,
            multimodal=multimodal,
            limit_description=limit_description,
            reference_context=reference_context,
        )

    @staticmethod
    def generate_verdicts(
        claims: List[str], reference_context: str, multimodal: bool = False
    ) -> str:
        """Build the prompt that judges each claim against the context.

        Args:
            claims: Claims to judge; rendered with the list's ``str()`` form.
            reference_context: The text shown under ``REFERENCE CONTEXT``.
            multimodal: Whether to include the multimodal rules block.

        Returns:
            A prompt requesting JSON of the form ``{"verdicts": [...]}``
            with a ``yes``/``no``/``idk`` verdict per claim.
        """
        return TurnFaithfulnessTemplate._render(
            """
            For each claim, determine whether it is supported, contradicted, or not addressed by the reference context.

            DEFINITIONS:
            - "yes" = The claim is directly supported by at least one truth.
            - "no" = The claim directly contradicts at least one truth.
            - "idk" = The context does not confirm or contradict the claim.

            RULES:
            - One verdict per claim, in the same order.
            - Do NOT use prior knowledge.
            - Only use the explicit truths provided.
            - A "yes" verdict must not include a reason.
            - A "no" or "idk" verdict must include a concise reason that quotes or paraphrases only the truths.
            - If a claim references an image and the visibility is unclear or ambiguous, use "idk".
            - Do not create new facts or explanations.

            {multimodal_rules}

            Output MUST be ONLY valid JSON:

            {{
                "verdicts": [
                    {{
                        "verdict": "yes"
                    }},
                    {{
                        "verdict": "no",
                        "reason": "<explanation>"
                    }},
                    {{
                        "verdict": "idk",
                        "reason": "<explanation>"
                    }}
                ]
            }}

            REFERENCE CONTEXT:
            {reference_context}

            CLAIMS:
            {claims}

            JSON:
            """,
            multimodal=multimodal,
            reference_context=reference_context,
            claims=claims,
        )

    @staticmethod
    def generate_reason(
        score: float, contradictions: List[str], multimodal: bool = False
    ) -> str:
        """Build the prompt that justifies a single faithfulness score.

        Args:
            score: The value shown under ``FAITHFULNESS SCORE``.
            contradictions: Contradictions to summarise; rendered with the
                list's ``str()`` form.
            multimodal: Whether to include the multimodal rules block.

        Returns:
            A prompt requesting JSON of the form ``{"reason": "..."}``.
        """
        return TurnFaithfulnessTemplate._render(
            """
            Below is a list of contradictions extracted from verdicts. Write a concise justification of the score.

            RULES:
            - If contradictions exist, summarize them in 1-3 sentences.
            - If no contradictions exist, respond:
            {{
                "reason": "No contradictions were found."
            }}
            - The summary must reference only the contradictions listed.
            - Tone must be neutral and concise.
            - No external knowledge may be used.

            {multimodal_rules}

            Output MUST be ONLY valid JSON:

            {{
                "reason": "<summary>"
            }}

            FAITHFULNESS SCORE:
            {score}

            CONTRADICTIONS:
            {contradictions}

            JSON:
            """,
            multimodal=multimodal,
            score=score,
            contradictions=contradictions,
        )

    @staticmethod
    def generate_final_reason(
        final_score: float, success: bool, reasons: List[str]
    ) -> str:
        """Build the prompt that merges all reasons into one explanation.

        Args:
            final_score: The value shown after ``Final Score:``.
            success: The value shown after ``Success:``.
            reasons: Reasons to synthesise; rendered with the list's
                ``str()`` form.

        Returns:
            A prompt asking for a single plain-text paragraph.
        """
        # The backslash keeps the first text line on its own indented source
        # line. Starting the text right after the opening quotes would put a
        # zero-indent line in the template and make dedent strip nothing.
        return TurnFaithfulnessTemplate._render(
            """\
            You are an AI evaluator producing a single final explanation for the TurnFaithfulnessMetric result.

            Context:
            This metric evaluates conversational faithfulness by extracting truths from retrieval context, extracting claims from the assistant's output, and generating verdicts that compare each claim against the truths. Each interaction yields a reason indicating why a verdict failed or succeeded. You are given all those reasons.

            Inputs:
            - final_score: the averaged score across all interactions.
            - success: whether the metric passed or failed
            - reasons: a list of textual reasons generated from individual verdicts.

            Instructions:
            1. Read all reasons and synthesize them into one unified explanation.
            2. Describe patterns of claim-truth mismatches, contradictions, hallucinations, unsupported statements, or image-related errors if present.
            3. Do not repeat every reason; merge them into a concise, coherent narrative.
            4. If the metric failed, state the dominant failure modes. If it passed, state why the model's claims aligned with truths.
            5. Output a single paragraph with no lists, no bullets, no markup.

            Output:
            A single paragraph explaining the final outcome.

            Here's the inputs:

            Final Score: {final_score}

            Reasons:
            {reasons}

            Success: {success}

            Now give me a final reason that explains why the metric passed or failed. Output ONLY the reason and nothing else.

            The final reason:
            """,
            final_score=final_score,
            reasons=reasons,
            success=success,
        )