deepeval 3.7.4__py3-none-any.whl → 3.7.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (155)
  1. deepeval/_version.py +1 -1
  2. deepeval/dataset/golden.py +54 -2
  3. deepeval/evaluate/evaluate.py +16 -8
  4. deepeval/evaluate/execute.py +70 -26
  5. deepeval/evaluate/utils.py +26 -22
  6. deepeval/integrations/pydantic_ai/agent.py +19 -2
  7. deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
  8. deepeval/metrics/__init__.py +14 -12
  9. deepeval/metrics/answer_relevancy/answer_relevancy.py +74 -29
  10. deepeval/metrics/answer_relevancy/template.py +188 -92
  11. deepeval/metrics/base_metric.py +2 -5
  12. deepeval/metrics/contextual_precision/contextual_precision.py +53 -15
  13. deepeval/metrics/contextual_precision/template.py +115 -66
  14. deepeval/metrics/contextual_recall/contextual_recall.py +50 -13
  15. deepeval/metrics/contextual_recall/template.py +106 -55
  16. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +47 -15
  17. deepeval/metrics/contextual_relevancy/template.py +87 -58
  18. deepeval/metrics/dag/templates.py +2 -2
  19. deepeval/metrics/faithfulness/faithfulness.py +70 -27
  20. deepeval/metrics/faithfulness/schema.py +1 -1
  21. deepeval/metrics/faithfulness/template.py +200 -115
  22. deepeval/metrics/g_eval/utils.py +2 -2
  23. deepeval/metrics/indicator.py +4 -4
  24. deepeval/metrics/multimodal_metrics/__init__.py +0 -18
  25. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +24 -17
  26. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +26 -21
  27. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +24 -17
  28. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +24 -17
  29. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +19 -19
  30. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +63 -78
  31. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +20 -20
  32. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +71 -50
  33. deepeval/metrics/ragas.py +3 -3
  34. deepeval/metrics/tool_correctness/tool_correctness.py +2 -2
  35. deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  36. deepeval/metrics/turn_contextual_precision/template.py +187 -0
  37. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +550 -0
  38. deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  39. deepeval/metrics/turn_contextual_recall/template.py +178 -0
  40. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +520 -0
  41. deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
  42. deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
  43. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +535 -0
  44. deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
  45. deepeval/metrics/turn_faithfulness/template.py +218 -0
  46. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +596 -0
  47. deepeval/metrics/utils.py +39 -58
  48. deepeval/models/__init__.py +0 -12
  49. deepeval/models/base_model.py +16 -38
  50. deepeval/models/embedding_models/__init__.py +7 -0
  51. deepeval/models/embedding_models/azure_embedding_model.py +52 -28
  52. deepeval/models/embedding_models/local_embedding_model.py +18 -14
  53. deepeval/models/embedding_models/ollama_embedding_model.py +38 -16
  54. deepeval/models/embedding_models/openai_embedding_model.py +40 -21
  55. deepeval/models/llms/amazon_bedrock_model.py +1 -2
  56. deepeval/models/llms/anthropic_model.py +44 -23
  57. deepeval/models/llms/azure_model.py +121 -36
  58. deepeval/models/llms/deepseek_model.py +18 -13
  59. deepeval/models/llms/gemini_model.py +129 -43
  60. deepeval/models/llms/grok_model.py +18 -13
  61. deepeval/models/llms/kimi_model.py +18 -13
  62. deepeval/models/llms/litellm_model.py +42 -22
  63. deepeval/models/llms/local_model.py +12 -7
  64. deepeval/models/llms/ollama_model.py +114 -12
  65. deepeval/models/llms/openai_model.py +137 -41
  66. deepeval/models/llms/portkey_model.py +24 -7
  67. deepeval/models/llms/utils.py +5 -3
  68. deepeval/models/retry_policy.py +17 -14
  69. deepeval/models/utils.py +46 -1
  70. deepeval/optimizer/__init__.py +5 -0
  71. deepeval/optimizer/algorithms/__init__.py +6 -0
  72. deepeval/optimizer/algorithms/base.py +29 -0
  73. deepeval/optimizer/algorithms/configs.py +18 -0
  74. deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  75. deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
  76. deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  77. deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
  78. deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  79. deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  80. deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  81. deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  82. deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  83. deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
  84. deepeval/{optimization → optimizer}/configs.py +5 -8
  85. deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
  86. deepeval/optimizer/prompt_optimizer.py +263 -0
  87. deepeval/optimizer/rewriter/__init__.py +5 -0
  88. deepeval/optimizer/rewriter/rewriter.py +124 -0
  89. deepeval/optimizer/rewriter/utils.py +214 -0
  90. deepeval/optimizer/scorer/__init__.py +5 -0
  91. deepeval/optimizer/scorer/base.py +86 -0
  92. deepeval/optimizer/scorer/scorer.py +316 -0
  93. deepeval/optimizer/scorer/utils.py +30 -0
  94. deepeval/optimizer/types.py +148 -0
  95. deepeval/{optimization → optimizer}/utils.py +47 -165
  96. deepeval/prompt/prompt.py +5 -9
  97. deepeval/test_case/__init__.py +1 -3
  98. deepeval/test_case/api.py +12 -10
  99. deepeval/test_case/conversational_test_case.py +19 -1
  100. deepeval/test_case/llm_test_case.py +152 -1
  101. deepeval/test_case/utils.py +4 -8
  102. deepeval/test_run/api.py +15 -14
  103. deepeval/test_run/test_run.py +3 -3
  104. deepeval/tracing/patchers.py +9 -4
  105. deepeval/tracing/tracing.py +2 -2
  106. deepeval/utils.py +65 -0
  107. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/METADATA +1 -4
  108. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/RECORD +116 -125
  109. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
  110. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
  111. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
  112. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
  113. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
  114. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
  115. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
  116. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
  117. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
  118. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
  119. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
  120. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  121. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
  122. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
  123. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  124. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
  125. deepeval/models/mlllms/__init__.py +0 -4
  126. deepeval/models/mlllms/azure_model.py +0 -343
  127. deepeval/models/mlllms/gemini_model.py +0 -313
  128. deepeval/models/mlllms/ollama_model.py +0 -175
  129. deepeval/models/mlllms/openai_model.py +0 -309
  130. deepeval/optimization/__init__.py +0 -13
  131. deepeval/optimization/adapters/__init__.py +0 -2
  132. deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
  133. deepeval/optimization/aggregates.py +0 -14
  134. deepeval/optimization/copro/configs.py +0 -31
  135. deepeval/optimization/gepa/__init__.py +0 -7
  136. deepeval/optimization/gepa/configs.py +0 -115
  137. deepeval/optimization/miprov2/configs.py +0 -134
  138. deepeval/optimization/miprov2/loop.py +0 -785
  139. deepeval/optimization/mutations/__init__.py +0 -0
  140. deepeval/optimization/mutations/prompt_rewriter.py +0 -458
  141. deepeval/optimization/policies/__init__.py +0 -16
  142. deepeval/optimization/policies/tie_breaker.py +0 -67
  143. deepeval/optimization/prompt_optimizer.py +0 -462
  144. deepeval/optimization/simba/__init__.py +0 -0
  145. deepeval/optimization/simba/configs.py +0 -33
  146. deepeval/optimization/types.py +0 -361
  147. deepeval/test_case/mllm_test_case.py +0 -170
  148. /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
  149. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
  150. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
  151. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
  152. /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
  153. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/LICENSE.md +0 -0
  154. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/WHEEL +0 -0
  155. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,301 @@
1
+ # Instruction Proposer for MIPROv2
2
+ #
3
+ # This module generates N diverse instruction candidates upfront,
4
+ # following the original MIPROv2 paper approach. Each candidate is
5
+ # generated with different "tips" (e.g., "be creative", "be concise")
6
+ # to encourage diversity in the instruction space.
7
+
8
+ from __future__ import annotations
9
+ import asyncio
10
+ import random
11
+ from typing import List, Optional, Union, TYPE_CHECKING
12
+
13
+ from deepeval.models.base_model import DeepEvalBaseLLM
14
+ from deepeval.prompt.prompt import Prompt
15
+ from deepeval.prompt.api import PromptType
16
+
17
+ if TYPE_CHECKING:
18
+ from deepeval.dataset.golden import Golden, ConversationalGolden
19
+
20
+
21
# Tips for encouraging diverse instruction generation (from DSPy MIPROv2).
# Each candidate instruction is generated under a different tip so that the
# proposed instructions spread out across the instruction space instead of
# clustering around trivial rewordings.
INSTRUCTION_TIPS = [
    "Be creative and think outside the box.",
    "Be concise and direct.",
    "Use step-by-step reasoning.",
    "Focus on clarity and precision.",
    "Include specific examples where helpful.",
    "Emphasize the most important aspects.",
    "Consider edge cases and exceptions.",
    "Use structured formatting when appropriate.",
    "Be thorough but avoid unnecessary details.",
    "Prioritize accuracy over creativity.",
    "Make the instruction self-contained.",
    "Use natural, conversational language.",
    "Be explicit about expected output format.",
    "Include context about common mistakes to avoid.",
    "Focus on the user's intent and goals.",
]
39
+
40
+
41
class InstructionProposer:
    """
    Generates N diverse instruction candidates for a given prompt.

    Following the MIPROv2 paper, this proposer:
    1. Analyzes the current prompt and task
    2. Optionally uses example inputs/outputs from goldens
    3. Applies different "tips" to encourage diversity
    4. Generates N candidate instructions
    """

    def __init__(
        self,
        optimizer_model: DeepEvalBaseLLM,
        random_state: Optional[Union[int, random.Random]] = None,
    ):
        # The LLM that rewrites/proposes instruction candidates.
        self.optimizer_model = optimizer_model

        # Accept either an integer seed or a pre-built Random; when nothing
        # is supplied, fall back to a freshly seeded generator.
        if isinstance(random_state, int):
            self.random_state = random.Random(random_state)
        elif random_state is not None:
            self.random_state = random_state
        else:
            self.random_state = random.Random()
64
+ def _format_prompt(self, prompt: Prompt) -> str:
65
+ """Format the prompt for the proposer context."""
66
+ if prompt.type == PromptType.LIST:
67
+ parts = []
68
+ for msg in prompt.messages_template:
69
+ role = msg.role or "unknown"
70
+ content = msg.content or ""
71
+ parts.append(f"[{role}]: {content}")
72
+ return "\n".join(parts)
73
+ else:
74
+ return prompt.text_template or ""
75
+
76
+ def _format_examples(
77
+ self,
78
+ goldens: Union[List["Golden"], List["ConversationalGolden"]],
79
+ max_examples: int = 3,
80
+ ) -> str:
81
+ """Format example inputs/outputs from goldens."""
82
+ if not goldens:
83
+ return "No examples available."
84
+
85
+ examples = []
86
+ sample = self.random_state.sample(
87
+ goldens, min(max_examples, len(goldens))
88
+ )
89
+
90
+ for i, golden in enumerate(sample, 1):
91
+ # Handle both Golden and ConversationalGolden
92
+ if hasattr(golden, "input"):
93
+ inp = str(golden.input)
94
+ out = str(golden.expected_output or "")
95
+ examples.append(
96
+ f"Example {i}:\n Input: {inp}\n Expected: {out}"
97
+ )
98
+ elif hasattr(golden, "messages"):
99
+ # ConversationalGolden
100
+ msgs = golden.messages[:2] if golden.messages else []
101
+ msg_str = " | ".join(str(m) for m in msgs)
102
+ examples.append(f"Example {i}: {msg_str}")
103
+
104
+ return "\n".join(examples) if examples else "No examples available."
105
+
106
+ def _compose_proposer_prompt(
107
+ self,
108
+ current_prompt: Prompt,
109
+ goldens: Union[List["Golden"], List["ConversationalGolden"]],
110
+ tip: str,
111
+ candidate_index: int,
112
+ ) -> str:
113
+ """Compose the prompt for generating an instruction candidate."""
114
+ prompt_text = self._format_prompt(current_prompt)
115
+ examples_text = self._format_examples(goldens)
116
+
117
+ return f"""You are an expert prompt engineer. Your task is to propose an improved instruction/prompt for an LLM task.
118
+
119
+ [CURRENT PROMPT]
120
+ {prompt_text}
121
+
122
+ [EXAMPLE INPUTS/OUTPUTS FROM THE TASK]
123
+ {examples_text}
124
+
125
+ [GENERATION TIP]
126
+ {tip}
127
+
128
+ [INSTRUCTIONS]
129
+ Based on the current prompt, the example task inputs/outputs, and the generation tip above, propose an improved version of the prompt.
130
+
131
+ This is candidate #{candidate_index + 1}. Make it meaningfully different from trivial variations.
132
+ Focus on improving clarity, effectiveness, and alignment with the task requirements.
133
+
134
+ Return ONLY the new prompt text, with no explanations or meta-commentary."""
135
+
136
+ def propose(
137
+ self,
138
+ prompt: Prompt,
139
+ goldens: Union[List["Golden"], List["ConversationalGolden"]],
140
+ num_candidates: int,
141
+ ) -> List[Prompt]:
142
+ """
143
+ Generate N instruction candidates synchronously.
144
+
145
+ Args:
146
+ prompt: The original prompt to improve
147
+ goldens: Example inputs/outputs for context
148
+ num_candidates: Number of candidates to generate
149
+
150
+ Returns:
151
+ List of Prompt candidates (including the original)
152
+ """
153
+ candidates: List[Prompt] = [prompt] # Always include original
154
+
155
+ # Select tips for diversity
156
+ tips = self._select_tips(num_candidates - 1)
157
+
158
+ for i, tip in enumerate(tips):
159
+ proposer_prompt = self._compose_proposer_prompt(
160
+ current_prompt=prompt,
161
+ goldens=goldens,
162
+ tip=tip,
163
+ candidate_index=i,
164
+ )
165
+
166
+ try:
167
+ output = self.optimizer_model.generate(proposer_prompt)
168
+ new_text = self._normalize_output(output)
169
+
170
+ if new_text and new_text.strip():
171
+ new_prompt = self._create_prompt_from_text(prompt, new_text)
172
+ if not self._is_duplicate(new_prompt, candidates):
173
+ candidates.append(new_prompt)
174
+ except Exception:
175
+ # Skip failed generations
176
+ continue
177
+
178
+ return candidates
179
+
180
+ async def a_propose(
181
+ self,
182
+ prompt: Prompt,
183
+ goldens: Union[List["Golden"], List["ConversationalGolden"]],
184
+ num_candidates: int,
185
+ ) -> List[Prompt]:
186
+ """
187
+ Generate N instruction candidates asynchronously (concurrently).
188
+ """
189
+ candidates: List[Prompt] = [prompt] # Always include original
190
+
191
+ tips = self._select_tips(num_candidates - 1)
192
+
193
+ # Build all proposer prompts upfront
194
+ proposer_prompts = [
195
+ self._compose_proposer_prompt(
196
+ current_prompt=prompt,
197
+ goldens=goldens,
198
+ tip=tip,
199
+ candidate_index=i,
200
+ )
201
+ for i, tip in enumerate(tips)
202
+ ]
203
+
204
+ # Generate all candidates concurrently
205
+ async def generate_one(proposer_prompt: str) -> Optional[str]:
206
+ try:
207
+ output = await self.optimizer_model.a_generate(proposer_prompt)
208
+ return self._normalize_output(output)
209
+ except Exception:
210
+ return None
211
+
212
+ results = await asyncio.gather(
213
+ *[generate_one(p) for p in proposer_prompts]
214
+ )
215
+
216
+ # Collect successful, non-duplicate candidates
217
+ for new_text in results:
218
+ if new_text and new_text.strip():
219
+ new_prompt = self._create_prompt_from_text(prompt, new_text)
220
+ if not self._is_duplicate(new_prompt, candidates):
221
+ candidates.append(new_prompt)
222
+
223
+ return candidates
224
+
225
+ def _select_tips(self, count: int) -> List[str]:
226
+ """Select diverse tips for candidate generation."""
227
+ if count <= 0:
228
+ return []
229
+
230
+ if count >= len(INSTRUCTION_TIPS):
231
+ # Use all tips, possibly repeating
232
+ tips = list(INSTRUCTION_TIPS)
233
+ while len(tips) < count:
234
+ tips.append(self.random_state.choice(INSTRUCTION_TIPS))
235
+ return tips[:count]
236
+
237
+ return self.random_state.sample(INSTRUCTION_TIPS, count)
238
+
239
+ def _normalize_output(self, output) -> str:
240
+ """Normalize LLM output to string."""
241
+ if isinstance(output, str):
242
+ return output.strip()
243
+ if isinstance(output, tuple):
244
+ return str(output[0]).strip() if output else ""
245
+ if isinstance(output, list):
246
+ return str(output[0]).strip() if output else ""
247
+ return str(output).strip()
248
+
249
+ def _create_prompt_from_text(
250
+ self, original: Prompt, new_text: str
251
+ ) -> Prompt:
252
+ """Create a new Prompt from generated text, preserving structure."""
253
+ if original.type == PromptType.LIST:
254
+ # For LIST prompts, update the system or first assistant message
255
+ new_messages = []
256
+ updated = False
257
+
258
+ for msg in original.messages_template:
259
+ if not updated and msg.role in ("system", "assistant"):
260
+ new_msg = type(msg)(role=msg.role, content=new_text)
261
+ new_messages.append(new_msg)
262
+ updated = True
263
+ else:
264
+ new_messages.append(msg)
265
+
266
+ if not updated and new_messages:
267
+ # Update the first message if no system/assistant found
268
+ first = new_messages[0]
269
+ new_messages[0] = type(first)(role=first.role, content=new_text)
270
+
271
+ return Prompt(messages_template=new_messages)
272
+ else:
273
+ return Prompt(text_template=new_text)
274
+
275
+ def _is_duplicate(self, new_prompt: Prompt, existing: List[Prompt]) -> bool:
276
+ """Check if a prompt is a duplicate of existing candidates."""
277
+ new_text = self._get_prompt_text(new_prompt).strip().lower()
278
+
279
+ for p in existing:
280
+ existing_text = self._get_prompt_text(p).strip().lower()
281
+ # Consider duplicates if >90% similar
282
+ if new_text == existing_text:
283
+ return True
284
+ # Simple similarity check
285
+ if len(new_text) > 0 and len(existing_text) > 0:
286
+ shorter = min(len(new_text), len(existing_text))
287
+ longer = max(len(new_text), len(existing_text))
288
+ if shorter / longer > 0.9:
289
+ # Check prefix similarity
290
+ if new_text[:shorter] == existing_text[:shorter]:
291
+ return True
292
+ return False
293
+
294
+ def _get_prompt_text(self, prompt: Prompt) -> str:
295
+ """Extract text from a prompt for comparison."""
296
+ if prompt.type == PromptType.LIST:
297
+ parts = []
298
+ for msg in prompt.messages_template:
299
+ parts.append(msg.content or "")
300
+ return " ".join(parts)
301
+ return prompt.text_template or ""
@@ -0,0 +1,5 @@
1
from .simba import SIMBA

# Names re-exported as the public API of this subpackage.
__all__ = [
    "SIMBA",
]