deepeval 3.7.4__py3-none-any.whl → 3.7.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (155) hide show
  1. deepeval/_version.py +1 -1
  2. deepeval/dataset/golden.py +54 -2
  3. deepeval/evaluate/evaluate.py +16 -8
  4. deepeval/evaluate/execute.py +70 -26
  5. deepeval/evaluate/utils.py +26 -22
  6. deepeval/integrations/pydantic_ai/agent.py +19 -2
  7. deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
  8. deepeval/metrics/__init__.py +14 -12
  9. deepeval/metrics/answer_relevancy/answer_relevancy.py +74 -29
  10. deepeval/metrics/answer_relevancy/template.py +188 -92
  11. deepeval/metrics/base_metric.py +2 -5
  12. deepeval/metrics/contextual_precision/contextual_precision.py +53 -15
  13. deepeval/metrics/contextual_precision/template.py +115 -66
  14. deepeval/metrics/contextual_recall/contextual_recall.py +50 -13
  15. deepeval/metrics/contextual_recall/template.py +106 -55
  16. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +47 -15
  17. deepeval/metrics/contextual_relevancy/template.py +87 -58
  18. deepeval/metrics/dag/templates.py +2 -2
  19. deepeval/metrics/faithfulness/faithfulness.py +70 -27
  20. deepeval/metrics/faithfulness/schema.py +1 -1
  21. deepeval/metrics/faithfulness/template.py +200 -115
  22. deepeval/metrics/g_eval/utils.py +2 -2
  23. deepeval/metrics/indicator.py +4 -4
  24. deepeval/metrics/multimodal_metrics/__init__.py +0 -18
  25. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +24 -17
  26. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +26 -21
  27. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +24 -17
  28. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +24 -17
  29. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +19 -19
  30. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +63 -78
  31. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +20 -20
  32. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +71 -50
  33. deepeval/metrics/ragas.py +3 -3
  34. deepeval/metrics/tool_correctness/tool_correctness.py +2 -2
  35. deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  36. deepeval/metrics/turn_contextual_precision/template.py +187 -0
  37. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +550 -0
  38. deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  39. deepeval/metrics/turn_contextual_recall/template.py +178 -0
  40. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +520 -0
  41. deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
  42. deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
  43. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +535 -0
  44. deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
  45. deepeval/metrics/turn_faithfulness/template.py +218 -0
  46. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +596 -0
  47. deepeval/metrics/utils.py +39 -58
  48. deepeval/models/__init__.py +0 -12
  49. deepeval/models/base_model.py +16 -38
  50. deepeval/models/embedding_models/__init__.py +7 -0
  51. deepeval/models/embedding_models/azure_embedding_model.py +52 -28
  52. deepeval/models/embedding_models/local_embedding_model.py +18 -14
  53. deepeval/models/embedding_models/ollama_embedding_model.py +38 -16
  54. deepeval/models/embedding_models/openai_embedding_model.py +40 -21
  55. deepeval/models/llms/amazon_bedrock_model.py +1 -2
  56. deepeval/models/llms/anthropic_model.py +44 -23
  57. deepeval/models/llms/azure_model.py +121 -36
  58. deepeval/models/llms/deepseek_model.py +18 -13
  59. deepeval/models/llms/gemini_model.py +129 -43
  60. deepeval/models/llms/grok_model.py +18 -13
  61. deepeval/models/llms/kimi_model.py +18 -13
  62. deepeval/models/llms/litellm_model.py +42 -22
  63. deepeval/models/llms/local_model.py +12 -7
  64. deepeval/models/llms/ollama_model.py +114 -12
  65. deepeval/models/llms/openai_model.py +137 -41
  66. deepeval/models/llms/portkey_model.py +24 -7
  67. deepeval/models/llms/utils.py +5 -3
  68. deepeval/models/retry_policy.py +17 -14
  69. deepeval/models/utils.py +46 -1
  70. deepeval/optimizer/__init__.py +5 -0
  71. deepeval/optimizer/algorithms/__init__.py +6 -0
  72. deepeval/optimizer/algorithms/base.py +29 -0
  73. deepeval/optimizer/algorithms/configs.py +18 -0
  74. deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  75. deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
  76. deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  77. deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
  78. deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  79. deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  80. deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  81. deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  82. deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  83. deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
  84. deepeval/{optimization → optimizer}/configs.py +5 -8
  85. deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
  86. deepeval/optimizer/prompt_optimizer.py +263 -0
  87. deepeval/optimizer/rewriter/__init__.py +5 -0
  88. deepeval/optimizer/rewriter/rewriter.py +124 -0
  89. deepeval/optimizer/rewriter/utils.py +214 -0
  90. deepeval/optimizer/scorer/__init__.py +5 -0
  91. deepeval/optimizer/scorer/base.py +86 -0
  92. deepeval/optimizer/scorer/scorer.py +316 -0
  93. deepeval/optimizer/scorer/utils.py +30 -0
  94. deepeval/optimizer/types.py +148 -0
  95. deepeval/{optimization → optimizer}/utils.py +47 -165
  96. deepeval/prompt/prompt.py +5 -9
  97. deepeval/test_case/__init__.py +1 -3
  98. deepeval/test_case/api.py +12 -10
  99. deepeval/test_case/conversational_test_case.py +19 -1
  100. deepeval/test_case/llm_test_case.py +152 -1
  101. deepeval/test_case/utils.py +4 -8
  102. deepeval/test_run/api.py +15 -14
  103. deepeval/test_run/test_run.py +3 -3
  104. deepeval/tracing/patchers.py +9 -4
  105. deepeval/tracing/tracing.py +2 -2
  106. deepeval/utils.py +65 -0
  107. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/METADATA +1 -4
  108. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/RECORD +116 -125
  109. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
  110. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
  111. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
  112. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
  113. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
  114. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
  115. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
  116. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
  117. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
  118. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
  119. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
  120. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  121. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
  122. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
  123. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  124. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
  125. deepeval/models/mlllms/__init__.py +0 -4
  126. deepeval/models/mlllms/azure_model.py +0 -343
  127. deepeval/models/mlllms/gemini_model.py +0 -313
  128. deepeval/models/mlllms/ollama_model.py +0 -175
  129. deepeval/models/mlllms/openai_model.py +0 -309
  130. deepeval/optimization/__init__.py +0 -13
  131. deepeval/optimization/adapters/__init__.py +0 -2
  132. deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
  133. deepeval/optimization/aggregates.py +0 -14
  134. deepeval/optimization/copro/configs.py +0 -31
  135. deepeval/optimization/gepa/__init__.py +0 -7
  136. deepeval/optimization/gepa/configs.py +0 -115
  137. deepeval/optimization/miprov2/configs.py +0 -134
  138. deepeval/optimization/miprov2/loop.py +0 -785
  139. deepeval/optimization/mutations/__init__.py +0 -0
  140. deepeval/optimization/mutations/prompt_rewriter.py +0 -458
  141. deepeval/optimization/policies/__init__.py +0 -16
  142. deepeval/optimization/policies/tie_breaker.py +0 -67
  143. deepeval/optimization/prompt_optimizer.py +0 -462
  144. deepeval/optimization/simba/__init__.py +0 -0
  145. deepeval/optimization/simba/configs.py +0 -33
  146. deepeval/optimization/types.py +0 -361
  147. deepeval/test_case/mllm_test_case.py +0 -170
  148. /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
  149. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
  150. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
  151. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
  152. /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
  153. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/LICENSE.md +0 -0
  154. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/WHEEL +0 -0
  155. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/entry_points.txt +0 -0
@@ -12,5 +12,11 @@ class ContextualRelevancyVerdicts(BaseModel):
12
12
  verdicts: List[ContextualRelevancyVerdict]
13
13
 
14
14
 
15
- class MultimodelContextualRelevancyScoreReason(BaseModel):
15
+ class ContextualRelevancyScoreReason(BaseModel):
16
16
  reason: str
17
+
18
+
19
+ class InteractionContextualRelevancyScore(BaseModel):
20
+ score: float
21
+ reason: str
22
+ verdicts: List[ContextualRelevancyVerdict]
@@ -0,0 +1,161 @@
1
+ from typing import List, Union
2
+ import textwrap
3
+ from deepeval.test_case import MLLMImage
4
+
5
+
6
+ class TurnContextualRelevancyTemplate:
7
+ multimodal_rules = """
8
+ --- MULTIMODAL INPUT RULES ---
9
+ - Treat image content as factual evidence.
10
+ - Only reference visual details that are explicitly and clearly visible.
11
+ - Do not infer or guess objects, text, or details not visibly present.
12
+ - If an image is unclear or ambiguous, mark uncertainty explicitly.
13
+ - When evaluating claims, compare them to BOTH textual and visual evidence.
14
+ - If the claim references something not clearly visible, respond with 'idk'.
15
+ """
16
+
17
+ @staticmethod
18
+ def generate_reason(
19
+ input: Union[str, List[Union[str, MLLMImage]]],
20
+ irrelevant_statements: List[str],
21
+ relevant_statements: List[str],
22
+ score: float,
23
+ multimodal: bool = False,
24
+ ):
25
+ # Note: irrelevancies parameter name in multimodal version is kept as irrelevant_statements for consistency
26
+ return textwrap.dedent(
27
+ f"""Based on the given user message, reasons for why the retrieval context is irrelevant to the user message, the statements in the retrieval context that is actually relevant to the retrieval context, and the contextual relevancy score (the closer to 1 the better), please generate a CONCISE reason for the score.
28
+ In your reason, you should quote data provided in the reasons for irrelevancy and relevant statements to support your point.
29
+
30
+ {TurnContextualRelevancyTemplate.multimodal_rules if multimodal else ""}
31
+
32
+ **
33
+ IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
34
+ Example JSON:
35
+ {{
36
+ "reason": "The score is <contextual_relevancy_score> because <your_reason>."
37
+ }}
38
+
39
+ If the score is 1, keep it short and say something positive with an upbeat encouraging tone (but don't overdo it otherwise it gets annoying).
40
+ **
41
+
42
+ Contextual Relevancy Score:
43
+ {score}
44
+
45
+ User Message:
46
+ {input}
47
+
48
+ Reasons for why the retrieval context is irrelevant to the user message:
49
+ {irrelevant_statements}
50
+
51
+ Statement in the retrieval context that is relevant to the user message:
52
+ {relevant_statements}
53
+
54
+ JSON:
55
+ """
56
+ )
57
+
58
+ @staticmethod
59
+ def generate_verdicts(
60
+ input: Union[str, List[Union[str, MLLMImage]]],
61
+ context: Union[str, List[Union[str, MLLMImage]]],
62
+ multimodal: bool = False,
63
+ ):
64
+ context_type = "context (image or string)" if multimodal else "context"
65
+ statement_or_image = "statement or image" if multimodal else "statement"
66
+
67
+ # Conditional instructions based on mode
68
+ extraction_instructions = ""
69
+ if multimodal:
70
+ extraction_instructions = textwrap.dedent(
71
+ """
72
+ If the context is textual, you should first extract the statements found in the context if the context, which are high level information found in the context, before deciding on a verdict and optionally a reason for each statement.
73
+ If the context is an image, `statement` should be a description of the image. Do not assume any information not visibly available.
74
+ """
75
+ ).strip()
76
+ else:
77
+ extraction_instructions = "You should first extract statements found in the context, which are high level information found in the context, before deciding on a verdict and optionally a reason for each statement."
78
+
79
+ # Additional instruction for empty context (only in non-multimodal)
80
+ empty_context_instruction = ""
81
+ if not multimodal:
82
+ empty_context_instruction = '\nIf provided context contains no actual content or statements then: give "no" as a "verdict",\nput context into "statement", and "No statements found in provided context." into "reason".'
83
+
84
+ return textwrap.dedent(
85
+ f"""Based on the user message and {context_type}, please generate a JSON object to indicate whether {'the context' if multimodal else 'each statement found in the context'} is relevant to the provided user message. The JSON will be a list of 'verdicts', with 2 mandatory fields: 'verdict' and 'statement', and 1 optional field: 'reason'.
86
+ {extraction_instructions}
87
+ The 'verdict' key should STRICTLY be either 'yes' or 'no', and states whether the {statement_or_image} is relevant to the user message.
88
+ Provide a 'reason' ONLY IF verdict is no. You MUST quote the irrelevant parts of the {statement_or_image} to back up your reason.{empty_context_instruction}
89
+
90
+ {TurnContextualRelevancyTemplate.multimodal_rules if multimodal else ""}
91
+
92
+ **
93
+ IMPORTANT: Please make sure to only return in JSON format.
94
+ Example Context: "Einstein won the Nobel Prize for his discovery of the photoelectric effect. He won the Nobel Prize in 1968. There was a cat."
95
+ Example User Message: "What were some of Einstein's achievements?"
96
+
97
+ Example:
98
+ {{
99
+ "verdicts": [
100
+ {{
101
+ "statement": "Einstein won the Nobel Prize for his discovery of the photoelectric effect in 1968",
102
+ "verdict": "yes"
103
+ }},
104
+ {{
105
+ "statement": "There was a cat.",
106
+ "reason": "The retrieval context contained the information 'There was a cat' when it has nothing to do with Einstein's achievements.",
107
+ "verdict": "no"
108
+ }}
109
+ ]
110
+ }}
111
+ **
112
+
113
+ User Message:
114
+ {input}
115
+
116
+ Context:
117
+ {context}
118
+
119
+ JSON:
120
+ """
121
+ )
122
+
123
+ @staticmethod
124
+ def generate_final_reason(
125
+ final_score: float, success: bool, reasons: List[str]
126
+ ):
127
+ return textwrap.dedent(
128
+ f"""You are an AI evaluator producing a single final explanation for the TurnContextualRelevancyMetric result.
129
+
130
+ Context:
131
+ This metric evaluates conversational contextual relevancy by determining whether statements in the retrieval context are relevant to the user message for each interaction. Each interaction yields a reason indicating which statements were relevant or irrelevant. You are given all those reasons.
132
+
133
+ Inputs:
134
+ - final_score: the averaged score across all interactions.
135
+ - success: whether the metric passed or failed
136
+ - reasons: a list of textual reasons generated from individual interactions.
137
+
138
+ Instructions:
139
+ 1. Read all reasons and synthesize them into one unified explanation.
140
+ 2. Describe patterns of irrelevant statements, off-topic context, or well-targeted retrieval if present.
141
+ 3. Do not repeat every reason; merge them into a concise, coherent narrative.
142
+ 4. If the metric failed, state the dominant failure modes. If it passed, state why the retrieval context was relevant to user messages.
143
+ 5. Output a single paragraph with no lists, no bullets, no markup.
144
+
145
+ Output:
146
+ A single paragraph explaining the final outcome.
147
+
148
+ Here's the inputs:
149
+
150
+ Final Score: {final_score}
151
+
152
+ Reasons:
153
+ {reasons}
154
+
155
+ Success: {success}
156
+
157
+ Now give me a final reason that explains why the metric passed or failed. Output ONLY the reason and nothing else.
158
+
159
+ The final reason:
160
+ """
161
+ )