deepeval 3.7.4__py3-none-any.whl → 3.7.5__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
Files changed (155)
  1. deepeval/_version.py +1 -1
  2. deepeval/dataset/golden.py +54 -2
  3. deepeval/evaluate/evaluate.py +16 -8
  4. deepeval/evaluate/execute.py +70 -26
  5. deepeval/evaluate/utils.py +26 -22
  6. deepeval/integrations/pydantic_ai/agent.py +19 -2
  7. deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
  8. deepeval/metrics/__init__.py +14 -12
  9. deepeval/metrics/answer_relevancy/answer_relevancy.py +74 -29
  10. deepeval/metrics/answer_relevancy/template.py +188 -92
  11. deepeval/metrics/base_metric.py +2 -5
  12. deepeval/metrics/contextual_precision/contextual_precision.py +53 -15
  13. deepeval/metrics/contextual_precision/template.py +115 -66
  14. deepeval/metrics/contextual_recall/contextual_recall.py +50 -13
  15. deepeval/metrics/contextual_recall/template.py +106 -55
  16. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +47 -15
  17. deepeval/metrics/contextual_relevancy/template.py +87 -58
  18. deepeval/metrics/dag/templates.py +2 -2
  19. deepeval/metrics/faithfulness/faithfulness.py +70 -27
  20. deepeval/metrics/faithfulness/schema.py +1 -1
  21. deepeval/metrics/faithfulness/template.py +200 -115
  22. deepeval/metrics/g_eval/utils.py +2 -2
  23. deepeval/metrics/indicator.py +4 -4
  24. deepeval/metrics/multimodal_metrics/__init__.py +0 -18
  25. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +24 -17
  26. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +26 -21
  27. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +24 -17
  28. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +24 -17
  29. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +19 -19
  30. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +63 -78
  31. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +20 -20
  32. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +71 -50
  33. deepeval/metrics/ragas.py +3 -3
  34. deepeval/metrics/tool_correctness/tool_correctness.py +2 -2
  35. deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  36. deepeval/metrics/turn_contextual_precision/template.py +187 -0
  37. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +550 -0
  38. deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  39. deepeval/metrics/turn_contextual_recall/template.py +178 -0
  40. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +520 -0
  41. deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
  42. deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
  43. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +535 -0
  44. deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
  45. deepeval/metrics/turn_faithfulness/template.py +218 -0
  46. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +596 -0
  47. deepeval/metrics/utils.py +39 -58
  48. deepeval/models/__init__.py +0 -12
  49. deepeval/models/base_model.py +16 -38
  50. deepeval/models/embedding_models/__init__.py +7 -0
  51. deepeval/models/embedding_models/azure_embedding_model.py +52 -28
  52. deepeval/models/embedding_models/local_embedding_model.py +18 -14
  53. deepeval/models/embedding_models/ollama_embedding_model.py +38 -16
  54. deepeval/models/embedding_models/openai_embedding_model.py +40 -21
  55. deepeval/models/llms/amazon_bedrock_model.py +1 -2
  56. deepeval/models/llms/anthropic_model.py +44 -23
  57. deepeval/models/llms/azure_model.py +121 -36
  58. deepeval/models/llms/deepseek_model.py +18 -13
  59. deepeval/models/llms/gemini_model.py +129 -43
  60. deepeval/models/llms/grok_model.py +18 -13
  61. deepeval/models/llms/kimi_model.py +18 -13
  62. deepeval/models/llms/litellm_model.py +42 -22
  63. deepeval/models/llms/local_model.py +12 -7
  64. deepeval/models/llms/ollama_model.py +114 -12
  65. deepeval/models/llms/openai_model.py +137 -41
  66. deepeval/models/llms/portkey_model.py +24 -7
  67. deepeval/models/llms/utils.py +5 -3
  68. deepeval/models/retry_policy.py +17 -14
  69. deepeval/models/utils.py +46 -1
  70. deepeval/optimizer/__init__.py +5 -0
  71. deepeval/optimizer/algorithms/__init__.py +6 -0
  72. deepeval/optimizer/algorithms/base.py +29 -0
  73. deepeval/optimizer/algorithms/configs.py +18 -0
  74. deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  75. deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
  76. deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  77. deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
  78. deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  79. deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  80. deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  81. deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  82. deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  83. deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
  84. deepeval/{optimization → optimizer}/configs.py +5 -8
  85. deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
  86. deepeval/optimizer/prompt_optimizer.py +263 -0
  87. deepeval/optimizer/rewriter/__init__.py +5 -0
  88. deepeval/optimizer/rewriter/rewriter.py +124 -0
  89. deepeval/optimizer/rewriter/utils.py +214 -0
  90. deepeval/optimizer/scorer/__init__.py +5 -0
  91. deepeval/optimizer/scorer/base.py +86 -0
  92. deepeval/optimizer/scorer/scorer.py +316 -0
  93. deepeval/optimizer/scorer/utils.py +30 -0
  94. deepeval/optimizer/types.py +148 -0
  95. deepeval/{optimization → optimizer}/utils.py +47 -165
  96. deepeval/prompt/prompt.py +5 -9
  97. deepeval/test_case/__init__.py +1 -3
  98. deepeval/test_case/api.py +12 -10
  99. deepeval/test_case/conversational_test_case.py +19 -1
  100. deepeval/test_case/llm_test_case.py +152 -1
  101. deepeval/test_case/utils.py +4 -8
  102. deepeval/test_run/api.py +15 -14
  103. deepeval/test_run/test_run.py +3 -3
  104. deepeval/tracing/patchers.py +9 -4
  105. deepeval/tracing/tracing.py +2 -2
  106. deepeval/utils.py +65 -0
  107. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/METADATA +1 -4
  108. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/RECORD +116 -125
  109. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
  110. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
  111. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
  112. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
  113. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
  114. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
  115. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
  116. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
  117. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
  118. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
  119. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
  120. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  121. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
  122. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
  123. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  124. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
  125. deepeval/models/mlllms/__init__.py +0 -4
  126. deepeval/models/mlllms/azure_model.py +0 -343
  127. deepeval/models/mlllms/gemini_model.py +0 -313
  128. deepeval/models/mlllms/ollama_model.py +0 -175
  129. deepeval/models/mlllms/openai_model.py +0 -309
  130. deepeval/optimization/__init__.py +0 -13
  131. deepeval/optimization/adapters/__init__.py +0 -2
  132. deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
  133. deepeval/optimization/aggregates.py +0 -14
  134. deepeval/optimization/copro/configs.py +0 -31
  135. deepeval/optimization/gepa/__init__.py +0 -7
  136. deepeval/optimization/gepa/configs.py +0 -115
  137. deepeval/optimization/miprov2/configs.py +0 -134
  138. deepeval/optimization/miprov2/loop.py +0 -785
  139. deepeval/optimization/mutations/__init__.py +0 -0
  140. deepeval/optimization/mutations/prompt_rewriter.py +0 -458
  141. deepeval/optimization/policies/__init__.py +0 -16
  142. deepeval/optimization/policies/tie_breaker.py +0 -67
  143. deepeval/optimization/prompt_optimizer.py +0 -462
  144. deepeval/optimization/simba/__init__.py +0 -0
  145. deepeval/optimization/simba/configs.py +0 -33
  146. deepeval/optimization/types.py +0 -361
  147. deepeval/test_case/mllm_test_case.py +0 -170
  148. /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
  149. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
  150. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
  151. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
  152. /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
  153. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/LICENSE.md +0 -0
  154. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/WHEEL +0 -0
  155. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/entry_points.txt +0 -0
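The headline structural changes in this list are the relocation of the prompt-optimization code from `deepeval.optimization` to `deepeval.optimizer` (with algorithms now under `deepeval/optimizer/algorithms/`) and the replacement of the standalone multimodal metric modules with new `turn_*` conversational metrics. A minimal import-migration sketch, using only module paths taken from the file list; the class names are assumed unchanged and whether they are also re-exported from the package root is not verifiable from this diff:

# 3.7.4 layout (removed in this release):
# from deepeval.optimization.prompt_optimizer import PromptOptimizer

# 3.7.5 layout, per the file list above:
from deepeval.optimizer.prompt_optimizer import PromptOptimizer
from deepeval.optimizer.algorithms.miprov2.miprov2 import MIPROV2  # new in this release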
deepeval/optimizer/algorithms/miprov2/miprov2.py
@@ -0,0 +1,752 @@
+# MIPROv2 - Multiprompt Instruction PRoposal Optimizer Version 2
+#
+# This implementation follows the original MIPROv2 paper and DSPy implementation:
+# https://arxiv.org/pdf/2406.11695
+# https://dspy.ai/api/optimizers/MIPROv2/
+#
+# The algorithm works in two phases:
+#
+# 1. PROPOSAL PHASE:
+#    a) Generate N diverse instruction candidates upfront
+#    b) Bootstrap few-shot demonstration sets from training data
+#
+# 2. OPTIMIZATION PHASE: Use Bayesian Optimization (Optuna TPE) to search
+#    over the joint space of (instruction_candidate, demo_set). Each trial:
+#    - Samples an instruction candidate index
+#    - Samples a demo set index
+#    - Renders the prompt with demos
+#    - Evaluates on a minibatch of examples
+#    - Uses the score to guide the Bayesian surrogate model
+#
+# Periodic full evaluation is performed every `minibatch_full_eval_steps`
+# to get accurate scores on the complete validation set.
+
+
+from __future__ import annotations
+import asyncio
+import uuid
+import random
+import time
+import logging
+from typing import (
+    Dict,
+    List,
+    Tuple,
+    TYPE_CHECKING,
+    Union,
+    Optional,
+    Callable,
+)
+
+try:
+    import optuna
+    from optuna.samplers import TPESampler
+
+    OPTUNA_AVAILABLE = True
+except ImportError:
+    OPTUNA_AVAILABLE = False
+    optuna = None
+    TPESampler = None
+
+from deepeval.models.base_model import DeepEvalBaseLLM
+from deepeval.errors import DeepEvalError
+from deepeval.optimizer.utils import Aggregator, mean_of_all
+from deepeval.optimizer.types import (
+    PromptConfiguration,
+    PromptConfigurationId,
+    ModuleId,
+    ScoreTable,
+    OptimizationReport,
+    RunnerStatusType,
+    RunnerStatusCallback,
+)
+from deepeval.optimizer.scorer.base import BaseScorer
+from deepeval.optimizer.algorithms.base import BaseAlgorithm
+from deepeval.optimizer.utils import build_prompt_config_snapshots
+from deepeval.prompt.prompt import Prompt
+from deepeval.optimizer.algorithms.miprov2.proposer import InstructionProposer
+from deepeval.optimizer.algorithms.miprov2.bootstrapper import (
+    DemoBootstrapper,
+    DemoSet,
+    render_prompt_with_demos,
+)
+from deepeval.optimizer.algorithms.configs import (
+    MIPROV2_DEFAULT_NUM_CANDIDATES,
+    MIPROV2_DEFAULT_NUM_TRIALS,
+    MIPROV2_DEFAULT_MINIBATCH_SIZE,
+    MIPROV2_DEFAULT_MINIBATCH_FULL_EVAL_STEPS,
+    MIPROV2_DEFAULT_MAX_BOOTSTRAPPED_DEMOS,
+    MIPROV2_DEFAULT_MAX_LABELED_DEMOS,
+    MIPROV2_DEFAULT_NUM_DEMO_SETS,
+)
+
+if TYPE_CHECKING:
+    from deepeval.dataset.golden import Golden, ConversationalGolden
+
+
+# Suppress Optuna's verbose logging
+logging.getLogger("optuna").setLevel(logging.WARNING)
+
+
+class MIPROV2(BaseAlgorithm):
+    """
+    MIPROv2 (Multiprompt Instruction PRoposal Optimizer Version 2)
+
+    A prompt optimizer that uses Bayesian Optimization to find the best
+    combination of instruction and few-shot demonstrations. Follows the
+    original MIPROv2 paper approach.
+
+    The optimization process:
+    1. Generate N diverse instruction candidates upfront
+    2. Bootstrap M demo sets from training examples
+    3. Use Optuna's TPE sampler for Bayesian Optimization over (instruction, demos)
+    4. Each trial evaluates a combination on a minibatch
+    5. Periodically evaluate the best combination on the full dataset
+
+    Parameters
+    ----------
+    num_candidates : int
+        Number of instruction candidates to propose. Default is 10.
+    num_trials : int
+        Number of Bayesian Optimization trials. Default is 20.
+    minibatch_size : int
+        Number of examples per minibatch evaluation. Default is 25.
+    minibatch_full_eval_steps : int
+        Evaluate best on full dataset every N trials. Default is 10.
+    max_bootstrapped_demos : int
+        Maximum bootstrapped demos per demo set. Default is 4.
+    max_labeled_demos : int
+        Maximum labeled demos (from expected_output) per set. Default is 4.
+    num_demo_sets : int
+        Number of demo sets to create. Default is 5.
+    random_seed : int, optional
+        RNG seed for reproducibility. If None, derived from time.time_ns().
+    aggregate_instances : Aggregator
+        Function to aggregate per-instance scores. Default is mean_of_all.
+    scorer : BaseScorer, optional
+        Scorer for evaluating prompts. Set by PromptOptimizer.
+    """
+
+    name = "MIPROv2"
+    SINGLE_MODULE_ID: ModuleId = "__module__"
+
+    def __init__(
+        self,
+        num_candidates: int = MIPROV2_DEFAULT_NUM_CANDIDATES,
+        num_trials: int = MIPROV2_DEFAULT_NUM_TRIALS,
+        minibatch_size: int = MIPROV2_DEFAULT_MINIBATCH_SIZE,
+        minibatch_full_eval_steps: int = MIPROV2_DEFAULT_MINIBATCH_FULL_EVAL_STEPS,
+        max_bootstrapped_demos: int = MIPROV2_DEFAULT_MAX_BOOTSTRAPPED_DEMOS,
+        max_labeled_demos: int = MIPROV2_DEFAULT_MAX_LABELED_DEMOS,
+        num_demo_sets: int = MIPROV2_DEFAULT_NUM_DEMO_SETS,
+        random_seed: Optional[int] = None,
+        aggregate_instances: Aggregator = mean_of_all,
+        scorer: Optional[BaseScorer] = None,
+    ) -> None:
+        if not OPTUNA_AVAILABLE:
+            raise DeepEvalError(
+                "MIPROv2 requires the 'optuna' package for Bayesian Optimization. "
+                "Install it with: pip install optuna"
+            )
+
+        # Validate parameters
+        if num_candidates < 1:
+            raise ValueError("num_candidates must be >= 1")
+        if num_trials < 1:
+            raise ValueError("num_trials must be >= 1")
+        if minibatch_size < 1:
+            raise ValueError("minibatch_size must be >= 1")
+        if minibatch_full_eval_steps < 1:
+            raise ValueError("minibatch_full_eval_steps must be >= 1")
+        if max_bootstrapped_demos < 0:
+            raise ValueError("max_bootstrapped_demos must be >= 0")
+        if max_labeled_demos < 0:
+            raise ValueError("max_labeled_demos must be >= 0")
+        if num_demo_sets < 1:
+            raise ValueError("num_demo_sets must be >= 1")
+
+        self.num_candidates = num_candidates
+        self.num_trials = num_trials
+        self.minibatch_size = minibatch_size
+        self.minibatch_full_eval_steps = minibatch_full_eval_steps
+        self.max_bootstrapped_demos = max_bootstrapped_demos
+        self.max_labeled_demos = max_labeled_demos
+        self.num_demo_sets = num_demo_sets
+        self.aggregate_instances = aggregate_instances
+        self.scorer = scorer
+
+        # Random seed handling
+        if random_seed is None:
+            random_seed = time.time_ns() % (2**31)
+        self.random_seed = random_seed
+        self.random_state = random.Random(random_seed)
+
+        # Runtime state
+        self.reset_state()
+
+        # Callbacks and models (set by PromptOptimizer)
+        self.status_callback: Optional[RunnerStatusCallback] = None
+        self.optimizer_model: Optional["DeepEvalBaseLLM"] = None
+
+        # Lazy-loaded components
+        self._proposer: Optional[InstructionProposer] = None
+        self._bootstrapper: Optional[DemoBootstrapper] = None
+
+    ##############
+    # Public API #
+    ##############
+
+    def execute(
+        self,
+        prompt: Prompt,
+        goldens: Union[List["Golden"], List["ConversationalGolden"]],
+    ) -> Tuple[Prompt, OptimizationReport]:
+        """
+        Synchronous MIPROv2 optimization.
+
+        Phase 1: Propose instruction candidates + Bootstrap demo sets
+        Phase 2: Use Bayesian Optimization to find the best combination
+        """
+        self._validate_inputs(goldens)
+        self._ensure_scorer()
+        self._ensure_proposer()
+        self._ensure_bootstrapper()
+        self.reset_state()
+
+        # Phase 1a: Propose instruction candidates
+        self._update_status("Phase 1: Proposing instruction candidates...", 0)
+        instruction_candidates = self._proposer.propose(
+            prompt=prompt,
+            goldens=goldens,
+            num_candidates=self.num_candidates,
+        )
+        self._register_instruction_candidates(instruction_candidates)
+
+        # Phase 1b: Bootstrap demo sets
+        self._update_status(
+            "Phase 1: Bootstrapping few-shot demonstrations...", 0
+        )
+        self._demo_sets = self._bootstrapper.bootstrap(
+            prompt=prompt,
+            goldens=goldens,
+            generate_fn=self._create_generate_fn(),
+        )
+        self._update_status(f"Bootstrapped {len(self._demo_sets)} demo sets", 0)
+
+        # Phase 2: Bayesian Optimization over (instruction, demos)
+        self._update_status("Phase 2: Starting Bayesian Optimization...", 0)
+        best_instr_idx, best_demo_idx = self._run_bayesian_optimization(goldens)
+
+        # Final full evaluation if not already done
+        config_key = (best_instr_idx, best_demo_idx)
+        if config_key not in self._full_eval_cache:
+            best_config = self._get_config_by_index(best_instr_idx)
+            best_demo_set = self._demo_sets[best_demo_idx]
+            self._full_evaluate(best_config, best_demo_set, goldens)
+
+        # Build report
+        best = self._best_by_aggregate()
+        return self._build_result(best)
+
+    async def a_execute(
+        self,
+        prompt: Prompt,
+        goldens: Union[List["Golden"], List["ConversationalGolden"]],
+    ) -> Tuple[Prompt, OptimizationReport]:
+        """
+        Asynchronous MIPROv2 optimization.
+        """
+        self._validate_inputs(goldens)
+        self._ensure_scorer()
+        self._ensure_proposer()
+        self._ensure_bootstrapper()
+        self.reset_state()
+
+        # Phase 1: Run proposal and bootstrapping concurrently
+        self._update_status(
+            "Phase 1: Proposing candidates & bootstrapping demos...", 0
+        )
+
+        instruction_candidates, demo_sets = await asyncio.gather(
+            self._proposer.a_propose(
+                prompt=prompt,
+                goldens=goldens,
+                num_candidates=self.num_candidates,
+            ),
+            self._bootstrapper.a_bootstrap(
+                prompt=prompt,
+                goldens=goldens,
+                a_generate_fn=self._create_async_generate_fn(),
+            ),
+        )
+
+        self._register_instruction_candidates(instruction_candidates)
+        self._demo_sets = demo_sets
+        self._update_status(
+            f"Generated {len(instruction_candidates)} candidates, {len(self._demo_sets)} demo sets",
+            0,
+        )
+
+        # Phase 2: Bayesian Optimization
+        self._update_status("Phase 2: Starting Bayesian Optimization...", 0)
+        best_instr_idx, best_demo_idx = await self._a_run_bayesian_optimization(
+            goldens
+        )
+
+        # Final full evaluation if not already done
+        config_key = (best_instr_idx, best_demo_idx)
+        if config_key not in self._full_eval_cache:
+            best_config = self._get_config_by_index(best_instr_idx)
+            best_demo_set = self._demo_sets[best_demo_idx]
+            await self._a_full_evaluate(best_config, best_demo_set, goldens)

+        # Build report
+        best = self._best_by_aggregate()
+        return self._build_result(best)
+
+    ###################
+    # State & Helpers #
+    ###################
+
+    def reset_state(self) -> None:
+        """Reset optimization state for a new run."""
+        self.optimization_id = str(uuid.uuid4())
+        self.prompt_configurations_by_id: Dict[
+            PromptConfigurationId, PromptConfiguration
+        ] = {}
+        self.parents_by_id: Dict[
+            PromptConfigurationId, Optional[PromptConfigurationId]
+        ] = {}
+        self.pareto_score_table: ScoreTable = {}
+
+        # Candidate tracking
+        self._instruction_candidates: List[PromptConfiguration] = []
+        self._demo_sets: List[DemoSet] = []
+
+        # Score tracking: (instr_idx, demo_idx) -> list of minibatch scores
+        self._combination_scores: Dict[Tuple[int, int], List[float]] = {}
+
+        # Full eval cache: (instr_idx, demo_idx) -> config_id
+        self._full_eval_cache: Dict[Tuple[int, int], PromptConfigurationId] = {}
+
+        # Trial tracking
+        self._trial_history: List[Dict] = []
+        self._best_trial_key: Tuple[int, int] = (0, 0)
+        self._best_trial_score: float = float("-inf")
+
+    def _validate_inputs(
+        self,
+        goldens: Union[List["Golden"], List["ConversationalGolden"]],
+    ) -> None:
+        """Validate input parameters."""
+        if len(goldens) < 1:
+            raise DeepEvalError(
+                "MIPROv2 prompt optimization requires at least 1 golden, but "
+                f"received {len(goldens)}. Provide at least one golden to run "
+                "the optimizer."
+            )
+
+    def _ensure_scorer(self) -> None:
+        """Ensure scorer is configured."""
+        if self.scorer is None:
+            raise DeepEvalError(
+                "MIPROv2 requires a `scorer`. "
+                "Construct one in PromptOptimizer and assign it to `runner.scorer`."
+            )
+
+    def _ensure_proposer(self) -> None:
+        """Lazily initialize the instruction proposer."""
+        if self._proposer is None:
+            if self.optimizer_model is None:
+                raise DeepEvalError(
+                    "MIPROv2 requires an `optimizer_model` for instruction proposal. "
+                    "Set it via PromptOptimizer."
+                )
+            self._proposer = InstructionProposer(
+                optimizer_model=self.optimizer_model,
+                random_state=self.random_state,
+            )
+
+    def _ensure_bootstrapper(self) -> None:
+        """Lazily initialize the demo bootstrapper."""
+        if self._bootstrapper is None:
+            self._bootstrapper = DemoBootstrapper(
+                max_bootstrapped_demos=self.max_bootstrapped_demos,
+                max_labeled_demos=self.max_labeled_demos,
+                num_demo_sets=self.num_demo_sets,
+                random_state=self.random_state,
+            )
+
+    def _create_generate_fn(
+        self,
+    ) -> Callable[[Prompt, Union["Golden", "ConversationalGolden"]], str]:
+        """Create a sync generate function for bootstrapping."""
+
+        def generate_fn(
+            prompt: Prompt,
+            golden: Union["Golden", "ConversationalGolden"],
+        ) -> str:
+            # Create a temporary config for generation
+            temp_config = PromptConfiguration.new(
+                prompts={self.SINGLE_MODULE_ID: prompt}
+            )
+            return self.scorer.generate(temp_config.prompts, golden)
+
+        return generate_fn
+
+    def _create_async_generate_fn(self) -> Callable:
+        """Create an async generate function for bootstrapping."""
+
+        async def a_generate_fn(
+            prompt: Prompt,
+            golden: Union["Golden", "ConversationalGolden"],
+        ) -> str:
+            temp_config = PromptConfiguration.new(
+                prompts={self.SINGLE_MODULE_ID: prompt}
+            )
+            return await self.scorer.a_generate(temp_config.prompts, golden)
+
+        return a_generate_fn
+
+    def _register_instruction_candidates(
+        self, candidates: List[Prompt]
+    ) -> None:
+        """Register all instruction candidates as configurations."""
+        for i, prompt in enumerate(candidates):
+            config = PromptConfiguration.new(
+                prompts={self.SINGLE_MODULE_ID: prompt},
+                parent=None if i == 0 else self._instruction_candidates[0].id,
+            )
+            self._instruction_candidates.append(config)
+            self.prompt_configurations_by_id[config.id] = config
+            self.parents_by_id[config.id] = config.parent
+
+    def _get_config_by_index(self, idx: int) -> PromptConfiguration:
+        """Get configuration by instruction candidate index."""
+        return self._instruction_candidates[idx]
+
+    def _draw_minibatch(
+        self,
+        goldens: Union[List["Golden"], List["ConversationalGolden"]],
+    ) -> Union[List["Golden"], List["ConversationalGolden"]]:
+        """Sample a minibatch from goldens."""
+        n = len(goldens)
+        if n <= 0:
+            return []
+        size = min(self.minibatch_size, n)
+        return [goldens[self.random_state.randrange(0, n)] for _ in range(size)]
+
+    def _render_config_with_demos(
+        self,
+        config: PromptConfiguration,
+        demo_set: DemoSet,
+    ) -> PromptConfiguration:
+        """Create a new config with demos rendered into the prompt."""
+        base_prompt = config.prompts[self.SINGLE_MODULE_ID]
+        rendered_prompt = render_prompt_with_demos(
+            prompt=base_prompt,
+            demo_set=demo_set,
+            max_demos=self.max_bootstrapped_demos + self.max_labeled_demos,
+        )
+
+        # Create a new config with the rendered prompt
+        rendered_config = PromptConfiguration.new(
+            prompts={self.SINGLE_MODULE_ID: rendered_prompt},
+            parent=config.id,
+        )
+        return rendered_config
+
+    ############################
+    #  Bayesian Optimization   #
+    ############################
+
+    def _run_bayesian_optimization(
+        self,
+        goldens: Union[List["Golden"], List["ConversationalGolden"]],
+    ) -> Tuple[int, int]:
+        """
+        Run Bayesian Optimization using Optuna's TPE sampler.
+        Returns the (instruction_idx, demo_set_idx) of the best combination.
+        """
+        num_instructions = len(self._instruction_candidates)
+        num_demo_sets = len(self._demo_sets)
+
+        # Create Optuna study with TPE sampler
+        sampler = TPESampler(seed=self.random_seed)
+        study = optuna.create_study(
+            direction="maximize",
+            sampler=sampler,
+        )
+
+        def objective(trial: "optuna.Trial") -> float:
+            # Sample instruction and demo set indices
+            instr_idx = trial.suggest_int("instr_idx", 0, num_instructions - 1)
+            demo_idx = trial.suggest_int("demo_idx", 0, num_demo_sets - 1)
+
+            # Get the configuration and demo set
+            config = self._get_config_by_index(instr_idx)
+            demo_set = self._demo_sets[demo_idx]
+
+            # Render prompt with demos
+            rendered_config = self._render_config_with_demos(config, demo_set)
+
+            # Draw minibatch and score
+            minibatch = self._draw_minibatch(goldens)
+            score = self.scorer.score_minibatch(rendered_config, minibatch)
+
+            # Track scores for this combination
+            combo_key = (instr_idx, demo_idx)
+            if combo_key not in self._combination_scores:
+                self._combination_scores[combo_key] = []
+            self._combination_scores[combo_key].append(score)
+
+            # Update best tracking
+            if score > self._best_trial_score:
+                self._best_trial_score = score
+                self._best_trial_key = combo_key
+
+            # Record trial
+            trial_num = len(self._trial_history) + 1
+            self._trial_history.append(
+                {
+                    "trial": trial_num,
+                    "instr_idx": instr_idx,
+                    "demo_idx": demo_idx,
+                    "score": score,
+                }
+            )
+
+            # Progress update
+            demo_info = (
+                f"{len(demo_set.demos)} demos" if demo_set.demos else "0-shot"
+            )
+            self._update_status(
+                f"Trial {trial_num}/{self.num_trials} - "
+                f"Instr {instr_idx}, {demo_info} - Score: {score:.4f}",
+                trial_num,
+            )
+
+            # Periodic full evaluation
+            if trial_num % self.minibatch_full_eval_steps == 0:
+                best_instr, best_demo = self._best_trial_key
+                if (best_instr, best_demo) not in self._full_eval_cache:
+                    best_config = self._get_config_by_index(best_instr)
+                    best_demo_set = self._demo_sets[best_demo]
+                    self._full_evaluate(best_config, best_demo_set, goldens)
+
+            return score
+
+        # Run optimization
+        study.optimize(
+            objective,
+            n_trials=self.num_trials,
+            show_progress_bar=False,
+        )
+
+        # Return the best combination
+        return (
+            study.best_params["instr_idx"],
+            study.best_params["demo_idx"],
+        )
+
+    async def _a_run_bayesian_optimization(
+        self,
+        goldens: Union[List["Golden"], List["ConversationalGolden"]],
+    ) -> Tuple[int, int]:
+        """
+        Async version of Bayesian Optimization.
+        """
+        num_instructions = len(self._instruction_candidates)
+        num_demo_sets = len(self._demo_sets)
+
+        sampler = TPESampler(seed=self.random_seed)
+        study = optuna.create_study(
+            direction="maximize",
+            sampler=sampler,
+        )
+
+        for trial_num in range(1, self.num_trials + 1):
+            trial = study.ask()
+
+            # Sample indices
+            instr_idx = trial.suggest_int("instr_idx", 0, num_instructions - 1)
+            demo_idx = trial.suggest_int("demo_idx", 0, num_demo_sets - 1)
+
+            # Get config and demos
+            config = self._get_config_by_index(instr_idx)
+            demo_set = self._demo_sets[demo_idx]
+            rendered_config = self._render_config_with_demos(config, demo_set)
+
+            # Score on minibatch
+            minibatch = self._draw_minibatch(goldens)
+            score = await self.scorer.a_score_minibatch(
+                rendered_config, minibatch
+            )
+
+            # Track scores
+            combo_key = (instr_idx, demo_idx)
+            if combo_key not in self._combination_scores:
+                self._combination_scores[combo_key] = []
+            self._combination_scores[combo_key].append(score)
+
+            # Update best
+            if score > self._best_trial_score:
+                self._best_trial_score = score
+                self._best_trial_key = combo_key
+
+            # Record trial
+            self._trial_history.append(
+                {
+                    "trial": trial_num,
+                    "instr_idx": instr_idx,
+                    "demo_idx": demo_idx,
+                    "score": score,
+                }
+            )
+
+            # Tell Optuna the result
+            study.tell(trial, score)
+
+            # Progress update
+            demo_info = (
+                f"{len(demo_set.demos)} demos" if demo_set.demos else "0-shot"
+            )
+            self._update_status(
+                f"Trial {trial_num}/{self.num_trials} - "
+                f"Instr {instr_idx}, {demo_info} - Score: {score:.4f}",
+                trial_num,
+            )
+
+            # Periodic full evaluation
+            if trial_num % self.minibatch_full_eval_steps == 0:
+                best_instr, best_demo = self._best_trial_key
+                if (best_instr, best_demo) not in self._full_eval_cache:
+                    best_config = self._get_config_by_index(best_instr)
+                    best_demo_set = self._demo_sets[best_demo]
+                    await self._a_full_evaluate(
+                        best_config, best_demo_set, goldens
+                    )
+
+        return (
+            study.best_params["instr_idx"],
+            study.best_params["demo_idx"],
+        )
+
+    ############################
+    #     Full Evaluation      #
+    ############################
+
+    def _full_evaluate(
+        self,
+        config: PromptConfiguration,
+        demo_set: DemoSet,
+        goldens: Union[List["Golden"], List["ConversationalGolden"]],
+    ) -> None:
+        """Perform full evaluation on all goldens."""
+        # Find the indices for this combination
+        instr_idx = self._instruction_candidates.index(config)
+        demo_idx = self._demo_sets.index(demo_set)
+        combo_key = (instr_idx, demo_idx)
+
+        if combo_key in self._full_eval_cache:
+            return
+
+        # Render with demos
+        rendered_config = self._render_config_with_demos(config, demo_set)
+
+        # Register the rendered config
+        self.prompt_configurations_by_id[rendered_config.id] = rendered_config
+        self.parents_by_id[rendered_config.id] = config.id
+
+        # Score on full set
+        scores = self.scorer.score_pareto(rendered_config, goldens)
+        self.pareto_score_table[rendered_config.id] = scores
+
+        # Cache the result
+        self._full_eval_cache[combo_key] = rendered_config.id
+
+    async def _a_full_evaluate(
+        self,
+        config: PromptConfiguration,
+        demo_set: DemoSet,
+        goldens: Union[List["Golden"], List["ConversationalGolden"]],
+    ) -> None:
+        """Async full evaluation."""
+        instr_idx = self._instruction_candidates.index(config)
+        demo_idx = self._demo_sets.index(demo_set)
+        combo_key = (instr_idx, demo_idx)
+
+        if combo_key in self._full_eval_cache:
+            return
+
+        rendered_config = self._render_config_with_demos(config, demo_set)
+        self.prompt_configurations_by_id[rendered_config.id] = rendered_config
+        self.parents_by_id[rendered_config.id] = config.id
+
+        scores = await self.scorer.a_score_pareto(rendered_config, goldens)
+        self.pareto_score_table[rendered_config.id] = scores
+        self._full_eval_cache[combo_key] = rendered_config.id
+
+    ############################
+    #     Result Building      #
+    ############################
+
+    def _best_by_aggregate(self) -> PromptConfiguration:
+        """Return the best candidate based on full evaluation scores."""
+        if not self.pareto_score_table:
+            # Fall back to best by trial scores
+            best_instr, best_demo = self._best_trial_key
+            config = self._get_config_by_index(best_instr)
+            demo_set = self._demo_sets[best_demo]
+            return self._render_config_with_demos(config, demo_set)
+
+        best_id: Optional[PromptConfigurationId] = None
+        best_score = float("-inf")
+
+        for config_id, scores in self.pareto_score_table.items():
+            agg_score = self.aggregate_instances(scores)
+            if agg_score > best_score:
+                best_score = agg_score
+                best_id = config_id
+
+        if best_id is None:
+            best_instr, best_demo = self._best_trial_key
+            config = self._get_config_by_index(best_instr)
+            demo_set = self._demo_sets[best_demo]
+            return self._render_config_with_demos(config, demo_set)
+
+        return self.prompt_configurations_by_id[best_id]
+
+    def _build_result(
+        self,
+        best: PromptConfiguration,
+    ) -> Tuple[Prompt, OptimizationReport]:
+        """Build the optimization result."""
+        prompt_config_snapshots = build_prompt_config_snapshots(
+            self.prompt_configurations_by_id
+        )
+
+        report = OptimizationReport(
+            optimization_id=self.optimization_id,
+            best_id=best.id,
+            accepted_iterations=self._trial_history,
+            pareto_scores=self.pareto_score_table,
+            parents=self.parents_by_id,
+            prompt_configurations=prompt_config_snapshots,
+        )
+
+        return best.prompts[self.SINGLE_MODULE_ID], report
+
+    ############################
+    #      Status Updates      #
+    ############################
+
+    def _update_status(self, message: str, step: int) -> None:
+        """Send status update via callback."""
+        if self.status_callback is not None:
+            self.status_callback(
+                RunnerStatusType.PROGRESS,
+                step_index=step,
+                total_steps=self.num_trials,
+                detail=message,
+            )
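For orientation, the sketch below reproduces the optimization-phase pattern described in the header comment of this file: an Optuna TPE study over the joint (instruction candidate, demo set) space, driven through the ask/tell interface the same way `_a_run_bayesian_optimization` does. The candidate lists and the scoring function are toy stand-ins for the proposer, bootstrapper, and `scorer.score_minibatch(...)`; only the Optuna calls mirror the code above.

import optuna
from optuna.samplers import TPESampler

# Toy stand-ins for the proposed instructions and bootstrapped demo sets.
instructions = ["Be concise.", "Cite the retrieved context.", "Reason step by step."]
demo_sets = [[], ["demo A"], ["demo A", "demo B"]]

def toy_minibatch_score(instr_idx: int, demo_idx: int) -> float:
    # Stand-in for rendering the prompt with demos and scoring a minibatch of goldens.
    return 0.1 * instr_idx + 0.2 * len(demo_sets[demo_idx])

study = optuna.create_study(direction="maximize", sampler=TPESampler(seed=0))
for _ in range(20):  # num_trials
    trial = study.ask()
    instr_idx = trial.suggest_int("instr_idx", 0, len(instructions) - 1)
    demo_idx = trial.suggest_int("demo_idx", 0, len(demo_sets) - 1)
    study.tell(trial, toy_minibatch_score(instr_idx, demo_idx))

print(study.best_params)  # best (instr_idx, demo_idx) combination found by TPE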