deepeval 3.7.4__py3-none-any.whl → 3.7.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (224)
  1. deepeval/_version.py +1 -1
  2. deepeval/config/settings.py +35 -1
  3. deepeval/dataset/api.py +23 -1
  4. deepeval/dataset/golden.py +139 -2
  5. deepeval/evaluate/evaluate.py +16 -11
  6. deepeval/evaluate/execute.py +13 -181
  7. deepeval/evaluate/utils.py +6 -26
  8. deepeval/integrations/pydantic_ai/agent.py +19 -2
  9. deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
  10. deepeval/key_handler.py +3 -0
  11. deepeval/metrics/__init__.py +14 -16
  12. deepeval/metrics/answer_relevancy/answer_relevancy.py +118 -116
  13. deepeval/metrics/answer_relevancy/template.py +22 -3
  14. deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
  15. deepeval/metrics/arena_g_eval/template.py +17 -1
  16. deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
  17. deepeval/metrics/argument_correctness/template.py +19 -2
  18. deepeval/metrics/base_metric.py +13 -44
  19. deepeval/metrics/bias/bias.py +102 -108
  20. deepeval/metrics/bias/template.py +14 -2
  21. deepeval/metrics/contextual_precision/contextual_precision.py +96 -94
  22. deepeval/metrics/contextual_precision/template.py +115 -66
  23. deepeval/metrics/contextual_recall/contextual_recall.py +94 -84
  24. deepeval/metrics/contextual_recall/template.py +106 -55
  25. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +86 -84
  26. deepeval/metrics/contextual_relevancy/template.py +87 -58
  27. deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
  28. deepeval/metrics/conversation_completeness/template.py +23 -3
  29. deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
  30. deepeval/metrics/conversational_dag/nodes.py +66 -123
  31. deepeval/metrics/conversational_dag/templates.py +16 -0
  32. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
  33. deepeval/metrics/dag/dag.py +10 -0
  34. deepeval/metrics/dag/nodes.py +63 -126
  35. deepeval/metrics/dag/templates.py +16 -2
  36. deepeval/metrics/exact_match/exact_match.py +9 -1
  37. deepeval/metrics/faithfulness/faithfulness.py +138 -149
  38. deepeval/metrics/faithfulness/schema.py +1 -1
  39. deepeval/metrics/faithfulness/template.py +200 -115
  40. deepeval/metrics/g_eval/g_eval.py +87 -78
  41. deepeval/metrics/g_eval/template.py +18 -1
  42. deepeval/metrics/g_eval/utils.py +7 -6
  43. deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
  44. deepeval/metrics/goal_accuracy/template.py +21 -3
  45. deepeval/metrics/hallucination/hallucination.py +60 -75
  46. deepeval/metrics/hallucination/template.py +13 -0
  47. deepeval/metrics/indicator.py +7 -10
  48. deepeval/metrics/json_correctness/json_correctness.py +40 -38
  49. deepeval/metrics/json_correctness/template.py +10 -0
  50. deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
  51. deepeval/metrics/knowledge_retention/schema.py +9 -3
  52. deepeval/metrics/knowledge_retention/template.py +12 -0
  53. deepeval/metrics/mcp/mcp_task_completion.py +68 -38
  54. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
  55. deepeval/metrics/mcp/template.py +52 -0
  56. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
  57. deepeval/metrics/mcp_use_metric/template.py +12 -0
  58. deepeval/metrics/misuse/misuse.py +77 -97
  59. deepeval/metrics/misuse/template.py +15 -0
  60. deepeval/metrics/multimodal_metrics/__init__.py +0 -19
  61. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +59 -53
  62. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +79 -95
  63. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +59 -53
  64. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +59 -53
  65. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +111 -109
  66. deepeval/metrics/non_advice/non_advice.py +79 -105
  67. deepeval/metrics/non_advice/template.py +12 -0
  68. deepeval/metrics/pattern_match/pattern_match.py +12 -4
  69. deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
  70. deepeval/metrics/pii_leakage/template.py +14 -0
  71. deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
  72. deepeval/metrics/plan_adherence/template.py +11 -0
  73. deepeval/metrics/plan_quality/plan_quality.py +63 -87
  74. deepeval/metrics/plan_quality/template.py +9 -0
  75. deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
  76. deepeval/metrics/prompt_alignment/template.py +12 -0
  77. deepeval/metrics/ragas.py +3 -3
  78. deepeval/metrics/role_adherence/role_adherence.py +48 -71
  79. deepeval/metrics/role_adherence/template.py +14 -0
  80. deepeval/metrics/role_violation/role_violation.py +75 -108
  81. deepeval/metrics/role_violation/template.py +12 -0
  82. deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
  83. deepeval/metrics/step_efficiency/template.py +11 -0
  84. deepeval/metrics/summarization/summarization.py +115 -183
  85. deepeval/metrics/summarization/template.py +19 -0
  86. deepeval/metrics/task_completion/task_completion.py +67 -73
  87. deepeval/metrics/tool_correctness/tool_correctness.py +45 -44
  88. deepeval/metrics/tool_use/tool_use.py +42 -66
  89. deepeval/metrics/topic_adherence/template.py +13 -0
  90. deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
  91. deepeval/metrics/toxicity/template.py +13 -0
  92. deepeval/metrics/toxicity/toxicity.py +80 -99
  93. deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  94. deepeval/metrics/turn_contextual_precision/template.py +187 -0
  95. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +592 -0
  96. deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  97. deepeval/metrics/turn_contextual_recall/template.py +178 -0
  98. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +563 -0
  99. deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
  100. deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
  101. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +576 -0
  102. deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
  103. deepeval/metrics/turn_faithfulness/template.py +218 -0
  104. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +627 -0
  105. deepeval/metrics/turn_relevancy/template.py +14 -0
  106. deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
  107. deepeval/metrics/utils.py +158 -122
  108. deepeval/models/__init__.py +0 -12
  109. deepeval/models/base_model.py +49 -33
  110. deepeval/models/embedding_models/__init__.py +7 -0
  111. deepeval/models/embedding_models/azure_embedding_model.py +79 -33
  112. deepeval/models/embedding_models/local_embedding_model.py +39 -20
  113. deepeval/models/embedding_models/ollama_embedding_model.py +52 -19
  114. deepeval/models/embedding_models/openai_embedding_model.py +42 -22
  115. deepeval/models/llms/amazon_bedrock_model.py +226 -72
  116. deepeval/models/llms/anthropic_model.py +178 -63
  117. deepeval/models/llms/azure_model.py +218 -60
  118. deepeval/models/llms/constants.py +2032 -0
  119. deepeval/models/llms/deepseek_model.py +95 -40
  120. deepeval/models/llms/gemini_model.py +209 -64
  121. deepeval/models/llms/grok_model.py +139 -68
  122. deepeval/models/llms/kimi_model.py +140 -90
  123. deepeval/models/llms/litellm_model.py +131 -37
  124. deepeval/models/llms/local_model.py +125 -21
  125. deepeval/models/llms/ollama_model.py +147 -24
  126. deepeval/models/llms/openai_model.py +222 -269
  127. deepeval/models/llms/portkey_model.py +81 -22
  128. deepeval/models/llms/utils.py +8 -3
  129. deepeval/models/retry_policy.py +17 -14
  130. deepeval/models/utils.py +106 -5
  131. deepeval/optimizer/__init__.py +5 -0
  132. deepeval/optimizer/algorithms/__init__.py +6 -0
  133. deepeval/optimizer/algorithms/base.py +29 -0
  134. deepeval/optimizer/algorithms/configs.py +18 -0
  135. deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  136. deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
  137. deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  138. deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
  139. deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  140. deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  141. deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  142. deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  143. deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  144. deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
  145. deepeval/{optimization → optimizer}/configs.py +5 -8
  146. deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
  147. deepeval/optimizer/prompt_optimizer.py +263 -0
  148. deepeval/optimizer/rewriter/__init__.py +5 -0
  149. deepeval/optimizer/rewriter/rewriter.py +124 -0
  150. deepeval/optimizer/rewriter/utils.py +214 -0
  151. deepeval/optimizer/scorer/__init__.py +5 -0
  152. deepeval/optimizer/scorer/base.py +86 -0
  153. deepeval/optimizer/scorer/scorer.py +316 -0
  154. deepeval/optimizer/scorer/utils.py +30 -0
  155. deepeval/optimizer/types.py +148 -0
  156. deepeval/{optimization → optimizer}/utils.py +47 -165
  157. deepeval/prompt/prompt.py +5 -9
  158. deepeval/simulator/conversation_simulator.py +43 -0
  159. deepeval/simulator/template.py +13 -0
  160. deepeval/test_case/__init__.py +1 -3
  161. deepeval/test_case/api.py +26 -45
  162. deepeval/test_case/arena_test_case.py +7 -2
  163. deepeval/test_case/conversational_test_case.py +68 -1
  164. deepeval/test_case/llm_test_case.py +206 -1
  165. deepeval/test_case/utils.py +4 -8
  166. deepeval/test_run/api.py +18 -14
  167. deepeval/test_run/test_run.py +3 -3
  168. deepeval/tracing/patchers.py +9 -4
  169. deepeval/tracing/tracing.py +2 -2
  170. deepeval/utils.py +65 -0
  171. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -4
  172. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/RECORD +180 -193
  173. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
  174. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
  175. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
  176. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
  177. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
  178. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
  179. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
  180. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
  181. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
  182. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
  183. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
  184. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  185. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
  186. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
  187. deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
  188. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
  189. deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
  190. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -148
  191. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
  192. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  193. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
  194. deepeval/models/mlllms/__init__.py +0 -4
  195. deepeval/models/mlllms/azure_model.py +0 -343
  196. deepeval/models/mlllms/gemini_model.py +0 -313
  197. deepeval/models/mlllms/ollama_model.py +0 -175
  198. deepeval/models/mlllms/openai_model.py +0 -309
  199. deepeval/optimization/__init__.py +0 -13
  200. deepeval/optimization/adapters/__init__.py +0 -2
  201. deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
  202. deepeval/optimization/aggregates.py +0 -14
  203. deepeval/optimization/copro/configs.py +0 -31
  204. deepeval/optimization/gepa/__init__.py +0 -7
  205. deepeval/optimization/gepa/configs.py +0 -115
  206. deepeval/optimization/miprov2/configs.py +0 -134
  207. deepeval/optimization/miprov2/loop.py +0 -785
  208. deepeval/optimization/mutations/__init__.py +0 -0
  209. deepeval/optimization/mutations/prompt_rewriter.py +0 -458
  210. deepeval/optimization/policies/__init__.py +0 -16
  211. deepeval/optimization/policies/tie_breaker.py +0 -67
  212. deepeval/optimization/prompt_optimizer.py +0 -462
  213. deepeval/optimization/simba/__init__.py +0 -0
  214. deepeval/optimization/simba/configs.py +0 -33
  215. deepeval/optimization/types.py +0 -361
  216. deepeval/test_case/mllm_test_case.py +0 -170
  217. /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
  218. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
  219. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
  220. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
  221. /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
  222. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
  223. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
  224. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
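Note on the package reorganization visible in the file list above: the old deepeval.optimization package is removed and its contents now live under deepeval.optimizer, with the individual algorithms grouped under deepeval.optimizer.algorithms. The sketch below derives the new module paths purely from the file moves listed above; it makes no assumptions about which classes those modules export.

# Module paths in deepeval 3.7.6, inferred directly from the renamed files above.
# (Old 3.7.4 paths such as deepeval.optimization.prompt_optimizer and
# deepeval.optimization.gepa.loop no longer exist in this release.)
import deepeval.optimizer.prompt_optimizer            # deepeval/optimizer/prompt_optimizer.py
import deepeval.optimizer.algorithms.copro.copro      # was deepeval/optimization/copro/loop.py
import deepeval.optimizer.algorithms.gepa.gepa        # was deepeval/optimization/gepa/loop.py
import deepeval.optimizer.algorithms.simba.simba      # was deepeval/optimization/simba/loop.py
import deepeval.optimizer.algorithms.miprov2.miprov2  # new in 3.7.6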
deepeval/optimizer/algorithms/miprov2/proposer.py
@@ -0,0 +1,301 @@
+# Instruction Proposer for MIPROv2
+#
+# This module generates N diverse instruction candidates upfront,
+# following the original MIPROv2 paper approach. Each candidate is
+# generated with different "tips" (e.g., "be creative", "be concise")
+# to encourage diversity in the instruction space.
+
+from __future__ import annotations
+import asyncio
+import random
+from typing import List, Optional, Union, TYPE_CHECKING
+
+from deepeval.models.base_model import DeepEvalBaseLLM
+from deepeval.prompt.prompt import Prompt
+from deepeval.prompt.api import PromptType
+
+if TYPE_CHECKING:
+    from deepeval.dataset.golden import Golden, ConversationalGolden
+
+
+# Tips for encouraging diverse instruction generation (from DSPy MIPROv2)
+INSTRUCTION_TIPS = [
+    "Be creative and think outside the box.",
+    "Be concise and direct.",
+    "Use step-by-step reasoning.",
+    "Focus on clarity and precision.",
+    "Include specific examples where helpful.",
+    "Emphasize the most important aspects.",
+    "Consider edge cases and exceptions.",
+    "Use structured formatting when appropriate.",
+    "Be thorough but avoid unnecessary details.",
+    "Prioritize accuracy over creativity.",
+    "Make the instruction self-contained.",
+    "Use natural, conversational language.",
+    "Be explicit about expected output format.",
+    "Include context about common mistakes to avoid.",
+    "Focus on the user's intent and goals.",
+]
+
+
+class InstructionProposer:
+    """
+    Generates N diverse instruction candidates for a given prompt.
+
+    Following the MIPROv2 paper, this proposer:
+    1. Analyzes the current prompt and task
+    2. Optionally uses example inputs/outputs from goldens
+    3. Applies different "tips" to encourage diversity
+    4. Generates N candidate instructions
+    """
+
+    def __init__(
+        self,
+        optimizer_model: DeepEvalBaseLLM,
+        random_state: Optional[Union[int, random.Random]] = None,
+    ):
+        self.optimizer_model = optimizer_model
+
+        if isinstance(random_state, int):
+            self.random_state = random.Random(random_state)
+        else:
+            self.random_state = random_state or random.Random()
+
+    def _format_prompt(self, prompt: Prompt) -> str:
+        """Format the prompt for the proposer context."""
+        if prompt.type == PromptType.LIST:
+            parts = []
+            for msg in prompt.messages_template:
+                role = msg.role or "unknown"
+                content = msg.content or ""
+                parts.append(f"[{role}]: {content}")
+            return "\n".join(parts)
+        else:
+            return prompt.text_template or ""
+
+    def _format_examples(
+        self,
+        goldens: Union[List["Golden"], List["ConversationalGolden"]],
+        max_examples: int = 3,
+    ) -> str:
+        """Format example inputs/outputs from goldens."""
+        if not goldens:
+            return "No examples available."
+
+        examples = []
+        sample = self.random_state.sample(
+            goldens, min(max_examples, len(goldens))
+        )
+
+        for i, golden in enumerate(sample, 1):
+            # Handle both Golden and ConversationalGolden
+            if hasattr(golden, "input"):
+                inp = str(golden.input)
+                out = str(golden.expected_output or "")
+                examples.append(
+                    f"Example {i}:\n Input: {inp}\n Expected: {out}"
+                )
+            elif hasattr(golden, "messages"):
+                # ConversationalGolden
+                msgs = golden.messages[:2] if golden.messages else []
+                msg_str = " | ".join(str(m) for m in msgs)
+                examples.append(f"Example {i}: {msg_str}")
+
+        return "\n".join(examples) if examples else "No examples available."
+
+    def _compose_proposer_prompt(
+        self,
+        current_prompt: Prompt,
+        goldens: Union[List["Golden"], List["ConversationalGolden"]],
+        tip: str,
+        candidate_index: int,
+    ) -> str:
+        """Compose the prompt for generating an instruction candidate."""
+        prompt_text = self._format_prompt(current_prompt)
+        examples_text = self._format_examples(goldens)
+
+        return f"""You are an expert prompt engineer. Your task is to propose an improved instruction/prompt for an LLM task.
+
+[CURRENT PROMPT]
+{prompt_text}
+
+[EXAMPLE INPUTS/OUTPUTS FROM THE TASK]
+{examples_text}
+
+[GENERATION TIP]
+{tip}
+
+[INSTRUCTIONS]
+Based on the current prompt, the example task inputs/outputs, and the generation tip above, propose an improved version of the prompt.
+
+This is candidate #{candidate_index + 1}. Make it meaningfully different from trivial variations.
+Focus on improving clarity, effectiveness, and alignment with the task requirements.
+
+Return ONLY the new prompt text, with no explanations or meta-commentary."""
+
+    def propose(
+        self,
+        prompt: Prompt,
+        goldens: Union[List["Golden"], List["ConversationalGolden"]],
+        num_candidates: int,
+    ) -> List[Prompt]:
+        """
+        Generate N instruction candidates synchronously.
+
+        Args:
+            prompt: The original prompt to improve
+            goldens: Example inputs/outputs for context
+            num_candidates: Number of candidates to generate
+
+        Returns:
+            List of Prompt candidates (including the original)
+        """
+        candidates: List[Prompt] = [prompt]  # Always include original
+
+        # Select tips for diversity
+        tips = self._select_tips(num_candidates - 1)
+
+        for i, tip in enumerate(tips):
+            proposer_prompt = self._compose_proposer_prompt(
+                current_prompt=prompt,
+                goldens=goldens,
+                tip=tip,
+                candidate_index=i,
+            )
+
+            try:
+                output = self.optimizer_model.generate(proposer_prompt)
+                new_text = self._normalize_output(output)
+
+                if new_text and new_text.strip():
+                    new_prompt = self._create_prompt_from_text(prompt, new_text)
+                    if not self._is_duplicate(new_prompt, candidates):
+                        candidates.append(new_prompt)
+            except Exception:
+                # Skip failed generations
+                continue
+
+        return candidates
+
+    async def a_propose(
+        self,
+        prompt: Prompt,
+        goldens: Union[List["Golden"], List["ConversationalGolden"]],
+        num_candidates: int,
+    ) -> List[Prompt]:
+        """
+        Generate N instruction candidates asynchronously (concurrently).
+        """
+        candidates: List[Prompt] = [prompt]  # Always include original
+
+        tips = self._select_tips(num_candidates - 1)
+
+        # Build all proposer prompts upfront
+        proposer_prompts = [
+            self._compose_proposer_prompt(
+                current_prompt=prompt,
+                goldens=goldens,
+                tip=tip,
+                candidate_index=i,
+            )
+            for i, tip in enumerate(tips)
+        ]
+
+        # Generate all candidates concurrently
+        async def generate_one(proposer_prompt: str) -> Optional[str]:
+            try:
+                output = await self.optimizer_model.a_generate(proposer_prompt)
+                return self._normalize_output(output)
+            except Exception:
+                return None
+
+        results = await asyncio.gather(
+            *[generate_one(p) for p in proposer_prompts]
+        )
+
+        # Collect successful, non-duplicate candidates
+        for new_text in results:
+            if new_text and new_text.strip():
+                new_prompt = self._create_prompt_from_text(prompt, new_text)
+                if not self._is_duplicate(new_prompt, candidates):
+                    candidates.append(new_prompt)
+
+        return candidates
+
+    def _select_tips(self, count: int) -> List[str]:
+        """Select diverse tips for candidate generation."""
+        if count <= 0:
+            return []
+
+        if count >= len(INSTRUCTION_TIPS):
+            # Use all tips, possibly repeating
+            tips = list(INSTRUCTION_TIPS)
+            while len(tips) < count:
+                tips.append(self.random_state.choice(INSTRUCTION_TIPS))
+            return tips[:count]
+
+        return self.random_state.sample(INSTRUCTION_TIPS, count)
+
+    def _normalize_output(self, output) -> str:
+        """Normalize LLM output to string."""
+        if isinstance(output, str):
+            return output.strip()
+        if isinstance(output, tuple):
+            return str(output[0]).strip() if output else ""
+        if isinstance(output, list):
+            return str(output[0]).strip() if output else ""
+        return str(output).strip()
+
+    def _create_prompt_from_text(
+        self, original: Prompt, new_text: str
+    ) -> Prompt:
+        """Create a new Prompt from generated text, preserving structure."""
+        if original.type == PromptType.LIST:
+            # For LIST prompts, update the system or first assistant message
+            new_messages = []
+            updated = False
+
+            for msg in original.messages_template:
+                if not updated and msg.role in ("system", "assistant"):
+                    new_msg = type(msg)(role=msg.role, content=new_text)
+                    new_messages.append(new_msg)
+                    updated = True
+                else:
+                    new_messages.append(msg)
+
+            if not updated and new_messages:
+                # Update the first message if no system/assistant found
+                first = new_messages[0]
+                new_messages[0] = type(first)(role=first.role, content=new_text)
+
+            return Prompt(messages_template=new_messages)
+        else:
+            return Prompt(text_template=new_text)
+
+    def _is_duplicate(self, new_prompt: Prompt, existing: List[Prompt]) -> bool:
+        """Check if a prompt is a duplicate of existing candidates."""
+        new_text = self._get_prompt_text(new_prompt).strip().lower()
+
+        for p in existing:
+            existing_text = self._get_prompt_text(p).strip().lower()
+            # Consider duplicates if >90% similar
+            if new_text == existing_text:
+                return True
+            # Simple similarity check
+            if len(new_text) > 0 and len(existing_text) > 0:
+                shorter = min(len(new_text), len(existing_text))
+                longer = max(len(new_text), len(existing_text))
+                if shorter / longer > 0.9:
+                    # Check prefix similarity
+                    if new_text[:shorter] == existing_text[:shorter]:
+                        return True
+        return False
+
+    def _get_prompt_text(self, prompt: Prompt) -> str:
+        """Extract text from a prompt for comparison."""
+        if prompt.type == PromptType.LIST:
+            parts = []
+            for msg in prompt.messages_template:
+                parts.append(msg.content or "")
+            return " ".join(parts)
+        return prompt.text_template or ""
deepeval/optimizer/algorithms/simba/__init__.py
@@ -0,0 +1,5 @@
+from .simba import SIMBA
+
+__all__ = [
+    "SIMBA",
+]
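Given the re-export above (and the move of deepeval/optimization/simba/loop.py to deepeval/optimizer/algorithms/simba/simba.py in the file list), SIMBA should now be importable from the package itself; a one-line hedged example:

# Assumes deepeval 3.7.6; SIMBA's constructor arguments are not shown in this diff.
from deepeval.optimizer.algorithms.simba import SIMBA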