deepeval-3.7.4-py3-none-any.whl → deepeval-3.7.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (155)
  1. deepeval/_version.py +1 -1
  2. deepeval/dataset/golden.py +54 -2
  3. deepeval/evaluate/evaluate.py +16 -8
  4. deepeval/evaluate/execute.py +70 -26
  5. deepeval/evaluate/utils.py +26 -22
  6. deepeval/integrations/pydantic_ai/agent.py +19 -2
  7. deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
  8. deepeval/metrics/__init__.py +14 -12
  9. deepeval/metrics/answer_relevancy/answer_relevancy.py +74 -29
  10. deepeval/metrics/answer_relevancy/template.py +188 -92
  11. deepeval/metrics/base_metric.py +2 -5
  12. deepeval/metrics/contextual_precision/contextual_precision.py +53 -15
  13. deepeval/metrics/contextual_precision/template.py +115 -66
  14. deepeval/metrics/contextual_recall/contextual_recall.py +50 -13
  15. deepeval/metrics/contextual_recall/template.py +106 -55
  16. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +47 -15
  17. deepeval/metrics/contextual_relevancy/template.py +87 -58
  18. deepeval/metrics/dag/templates.py +2 -2
  19. deepeval/metrics/faithfulness/faithfulness.py +70 -27
  20. deepeval/metrics/faithfulness/schema.py +1 -1
  21. deepeval/metrics/faithfulness/template.py +200 -115
  22. deepeval/metrics/g_eval/utils.py +2 -2
  23. deepeval/metrics/indicator.py +4 -4
  24. deepeval/metrics/multimodal_metrics/__init__.py +0 -18
  25. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +24 -17
  26. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +26 -21
  27. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +24 -17
  28. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +24 -17
  29. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +19 -19
  30. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +63 -78
  31. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +20 -20
  32. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +71 -50
  33. deepeval/metrics/ragas.py +3 -3
  34. deepeval/metrics/tool_correctness/tool_correctness.py +2 -2
  35. deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  36. deepeval/metrics/turn_contextual_precision/template.py +187 -0
  37. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +550 -0
  38. deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  39. deepeval/metrics/turn_contextual_recall/template.py +178 -0
  40. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +520 -0
  41. deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
  42. deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
  43. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +535 -0
  44. deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
  45. deepeval/metrics/turn_faithfulness/template.py +218 -0
  46. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +596 -0
  47. deepeval/metrics/utils.py +39 -58
  48. deepeval/models/__init__.py +0 -12
  49. deepeval/models/base_model.py +16 -38
  50. deepeval/models/embedding_models/__init__.py +7 -0
  51. deepeval/models/embedding_models/azure_embedding_model.py +52 -28
  52. deepeval/models/embedding_models/local_embedding_model.py +18 -14
  53. deepeval/models/embedding_models/ollama_embedding_model.py +38 -16
  54. deepeval/models/embedding_models/openai_embedding_model.py +40 -21
  55. deepeval/models/llms/amazon_bedrock_model.py +1 -2
  56. deepeval/models/llms/anthropic_model.py +44 -23
  57. deepeval/models/llms/azure_model.py +121 -36
  58. deepeval/models/llms/deepseek_model.py +18 -13
  59. deepeval/models/llms/gemini_model.py +129 -43
  60. deepeval/models/llms/grok_model.py +18 -13
  61. deepeval/models/llms/kimi_model.py +18 -13
  62. deepeval/models/llms/litellm_model.py +42 -22
  63. deepeval/models/llms/local_model.py +12 -7
  64. deepeval/models/llms/ollama_model.py +114 -12
  65. deepeval/models/llms/openai_model.py +137 -41
  66. deepeval/models/llms/portkey_model.py +24 -7
  67. deepeval/models/llms/utils.py +5 -3
  68. deepeval/models/retry_policy.py +17 -14
  69. deepeval/models/utils.py +46 -1
  70. deepeval/optimizer/__init__.py +5 -0
  71. deepeval/optimizer/algorithms/__init__.py +6 -0
  72. deepeval/optimizer/algorithms/base.py +29 -0
  73. deepeval/optimizer/algorithms/configs.py +18 -0
  74. deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  75. deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
  76. deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  77. deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
  78. deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  79. deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  80. deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  81. deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  82. deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  83. deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
  84. deepeval/{optimization → optimizer}/configs.py +5 -8
  85. deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
  86. deepeval/optimizer/prompt_optimizer.py +263 -0
  87. deepeval/optimizer/rewriter/__init__.py +5 -0
  88. deepeval/optimizer/rewriter/rewriter.py +124 -0
  89. deepeval/optimizer/rewriter/utils.py +214 -0
  90. deepeval/optimizer/scorer/__init__.py +5 -0
  91. deepeval/optimizer/scorer/base.py +86 -0
  92. deepeval/optimizer/scorer/scorer.py +316 -0
  93. deepeval/optimizer/scorer/utils.py +30 -0
  94. deepeval/optimizer/types.py +148 -0
  95. deepeval/{optimization → optimizer}/utils.py +47 -165
  96. deepeval/prompt/prompt.py +5 -9
  97. deepeval/test_case/__init__.py +1 -3
  98. deepeval/test_case/api.py +12 -10
  99. deepeval/test_case/conversational_test_case.py +19 -1
  100. deepeval/test_case/llm_test_case.py +152 -1
  101. deepeval/test_case/utils.py +4 -8
  102. deepeval/test_run/api.py +15 -14
  103. deepeval/test_run/test_run.py +3 -3
  104. deepeval/tracing/patchers.py +9 -4
  105. deepeval/tracing/tracing.py +2 -2
  106. deepeval/utils.py +65 -0
  107. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/METADATA +1 -4
  108. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/RECORD +116 -125
  109. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
  110. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
  111. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
  112. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
  113. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
  114. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
  115. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
  116. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
  117. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
  118. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
  119. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
  120. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  121. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
  122. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
  123. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  124. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
  125. deepeval/models/mlllms/__init__.py +0 -4
  126. deepeval/models/mlllms/azure_model.py +0 -343
  127. deepeval/models/mlllms/gemini_model.py +0 -313
  128. deepeval/models/mlllms/ollama_model.py +0 -175
  129. deepeval/models/mlllms/openai_model.py +0 -309
  130. deepeval/optimization/__init__.py +0 -13
  131. deepeval/optimization/adapters/__init__.py +0 -2
  132. deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
  133. deepeval/optimization/aggregates.py +0 -14
  134. deepeval/optimization/copro/configs.py +0 -31
  135. deepeval/optimization/gepa/__init__.py +0 -7
  136. deepeval/optimization/gepa/configs.py +0 -115
  137. deepeval/optimization/miprov2/configs.py +0 -134
  138. deepeval/optimization/miprov2/loop.py +0 -785
  139. deepeval/optimization/mutations/__init__.py +0 -0
  140. deepeval/optimization/mutations/prompt_rewriter.py +0 -458
  141. deepeval/optimization/policies/__init__.py +0 -16
  142. deepeval/optimization/policies/tie_breaker.py +0 -67
  143. deepeval/optimization/prompt_optimizer.py +0 -462
  144. deepeval/optimization/simba/__init__.py +0 -0
  145. deepeval/optimization/simba/configs.py +0 -33
  146. deepeval/optimization/types.py +0 -361
  147. deepeval/test_case/mllm_test_case.py +0 -170
  148. /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
  149. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
  150. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
  151. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
  152. /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
  153. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/LICENSE.md +0 -0
  154. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/WHEEL +0 -0
  155. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/entry_points.txt +0 -0
deepeval/optimization/miprov2/loop.py
@@ -1,785 +0,0 @@
1
- # - MIPROv2 0-shot variant:
2
- # - Works on a single set of goldens (no D_pareto split).
3
- # - Maintains an unbounded pool of candidate prompts.
4
- # - At each iteration:
5
- # - Select a parent via epsilon-greedy on mean minibatch score.
6
- # - Propose a single child prompt via PromptRewriter.
7
- # - Accept the child if its minibatch score improves on the parent
8
- # by at least `min_delta`, and add it to the pool.
9
- # - Uses `full_eval_every` (if set) to periodically re-score the current
10
- # best candidate on the full golden set.
11
-
12
-
13
- from __future__ import annotations
14
- import uuid
15
- import random
16
- import time
17
-
18
- from typing import (
19
- Awaitable,
20
- Callable,
21
- Dict,
22
- List,
23
- Tuple,
24
- TYPE_CHECKING,
25
- Union,
26
- Optional,
27
- )
28
-
29
- from deepeval.errors import DeepEvalError
30
- from deepeval.optimization.aggregates import Aggregator, mean_of_all
31
- from deepeval.optimization.types import (
32
- AcceptedIterationDict,
33
- PromptConfiguration,
34
- PromptConfigurationId,
35
- ModuleId,
36
- ScoreTable,
37
- ScoringAdapter,
38
- OptimizationResult,
39
- RunnerStatusType,
40
- RunnerStatusCallbackProtocol,
41
- )
42
- from deepeval.optimization.utils import (
43
- build_prompt_config_snapshots,
44
- )
45
- from deepeval.prompt.api import PromptType
46
- from deepeval.prompt.prompt import Prompt
47
- from deepeval.optimization.mutations.prompt_rewriter import PromptRewriter
48
- from .configs import MIPROConfig
49
-
50
- if TYPE_CHECKING:
51
- from deepeval.dataset.golden import Golden, ConversationalGolden
52
-
53
-
54
- class MIPRORunner:
55
- """
56
- 0-shot MIPRO-style loop with sync/async execution.
57
-
58
- This runner is intentionally low level and does not know about metrics,
59
- models, or async configs. It relies on a preconfigured ScoringAdapter and
60
- PromptRewriter, which are typically constructed by PromptOptimizer.
61
-
62
- - Optimizes a single Prompt (instruction) against a list of Goldens.
63
- - Uses mini-batches of goldens for trial scoring and simple selection over
64
- prompt candidates based on mean minibatch scores instead of a full TPE
65
- implementation.
66
- """
67
-
68
- SINGLE_MODULE_ID: ModuleId = "__module__"
69
-
70
- def __init__(
71
- self,
72
- *,
73
- config: MIPROConfig,
74
- aggregate_instances: Aggregator = mean_of_all,
75
- scoring_adapter: Optional[ScoringAdapter] = None,
76
- ) -> None:
77
- self.config = config
78
- self.aggregate_instances = aggregate_instances
79
- self.scoring_adapter = scoring_adapter
80
-
81
- # Random seeded from config is used for minibatch sampling and candidate selection.
82
- self.random_state = random.Random(config.random_seed)
83
-
84
- # Runtime state to be reset between runs
85
- self.reset_state()
86
-
87
- # Status callback set by PromptOptimizer:
88
- # (kind, step_index, total_steps, detail) -> None
89
- self.status_callback: Optional[RunnerStatusCallbackProtocol] = None
90
-
91
- # Model callback used by the rewriter set by PromptOptimizer.
92
- self.model_callback: Optional[
93
- Callable[
94
- ...,
95
- Union[
96
- str,
97
- Dict,
98
- Tuple[Union[str, Dict], float],
99
- ],
100
- ]
101
- ] = None
102
-
103
- # Lazy-loaded PromptRewriter (can be overridden by PromptOptimizer)
104
- self._rewriter: Optional[PromptRewriter] = None
105
-
106
- ##############
107
- # Public API #
108
- ##############
109
-
110
- def execute(
111
- self,
112
- *,
113
- prompt: Prompt,
114
- goldens: Union[List["Golden"], List["ConversationalGolden"]],
115
- ) -> Tuple[Prompt, Dict]:
116
- """
117
- Synchronous MIPRO run from a full list of goldens.
118
-
119
- The full goldens set is used both for mini-batched scoring during
120
- optimization and for a final full evaluation of the best candidate.
121
- """
122
- total_goldens = len(goldens)
123
- if total_goldens < 1:
124
- raise DeepEvalError(
125
- "MIPRO prompt optimization requires at least 1 golden, but "
126
- f"received {total_goldens}. Provide at least one golden to run "
127
- "the optimizer."
128
- )
129
-
130
- self._ensure_scoring_adapter()
131
- self._ensure_rewriter()
132
- self.reset_state()
133
-
134
- # Seed candidate pool with the root prompt configuration.
135
- seed_prompts_by_module = {self.SINGLE_MODULE_ID: prompt}
136
- root_prompt_configuration = PromptConfiguration.new(
137
- prompts=dict(seed_prompts_by_module)
138
- )
139
- # Add root candidate to the pool, but defer its first minibatch
140
- # evaluation until the first iteration so that any long running
141
- # model calls happen under the main loop (with progress updates).
142
- self._add_prompt_configuration(root_prompt_configuration)
143
-
144
- accepted_iterations: List[Dict] = []
145
- self.trial_index = 0
146
-
147
- def _one_iteration() -> bool:
148
- nonlocal accepted_iterations
149
-
150
- if not goldens:
151
- return False
152
-
153
- # Lazily seed with a minibatch score for the root
154
- # candidate on the first iteration.
155
- if not self._minibatch_score_counts:
156
- seed_minibatch = self._draw_minibatch(goldens)
157
- root_score = self.scoring_adapter.minibatch_score(
158
- root_prompt_configuration, seed_minibatch
159
- )
160
- self._record_minibatch_score(
161
- root_prompt_configuration.id, root_score
162
- )
163
-
164
- # 1 Choose which candidate prompt to mutate
165
- parent_prompt_configuration = self._select_candidate()
166
- selected_module_id: ModuleId = self.SINGLE_MODULE_ID
167
-
168
- minibatch = self._draw_minibatch(goldens)
169
-
170
- feedback_text = self.scoring_adapter.minibatch_feedback(
171
- parent_prompt_configuration, selected_module_id, minibatch
172
- )
173
-
174
- # 2. Generate a child prompt
175
- child_prompt = self._generate_child_prompt(
176
- selected_module_id,
177
- parent_prompt_configuration,
178
- feedback_text,
179
- )
180
- if child_prompt is None:
181
- # Treat as a no-op iteration (counts towards budget).
182
- self.trial_index += 1
183
- return True
184
-
185
- child_prompt_configuration = self._make_child(
186
- selected_module_id,
187
- parent_prompt_configuration,
188
- child_prompt,
189
- )
190
-
191
- child_score = self.scoring_adapter.minibatch_score(
192
- child_prompt_configuration, minibatch
193
- )
194
-
195
- before_mean = self._mean_minibatch_score(
196
- parent_prompt_configuration.id
197
- )
198
-
199
- # 3. Evaluate & decide whether to accept the child
200
- jitter = 1e-6
201
- if child_score >= before_mean + max(self.config.min_delta, jitter):
202
- # Accept: add to pool, update minibatch-score history for this candidate,
203
- # and record the iteration.
204
- self._add_prompt_configuration(child_prompt_configuration)
205
- self._record_minibatch_score(
206
- child_prompt_configuration.id, child_score
207
- )
208
-
209
- accepted_iterations.append(
210
- AcceptedIterationDict(
211
- parent=parent_prompt_configuration.id,
212
- child=child_prompt_configuration.id,
213
- module=selected_module_id,
214
- before=before_mean,
215
- after=child_score,
216
- )
217
- )
218
- # else: reject; do not add child to the candidate pool
219
-
220
- self.trial_index += 1
221
- if (
222
- self.config.full_eval_every is not None
223
- and self.trial_index % self.config.full_eval_every == 0
224
- ):
225
- self._full_evaluate_best(goldens)
226
-
227
- return True
228
-
229
- self._run_loop_iteration(_one_iteration)
230
-
231
- # Ensure at least one candidate has been fully evaluated.
232
- if not self.pareto_score_table:
233
- self._full_evaluate_best(goldens)
234
-
235
- best = self._best_by_aggregate()
236
- prompt_config_snapshots = build_prompt_config_snapshots(
237
- self.prompt_configurations_by_id
238
- )
239
- report = OptimizationResult(
240
- optimization_id=self.optimization_id,
241
- best_id=best.id,
242
- accepted_iterations=accepted_iterations,
243
- pareto_scores=self.pareto_score_table,
244
- parents=self.parents_by_id,
245
- prompt_configurations=prompt_config_snapshots,
246
- )
247
- return best.prompts[self.SINGLE_MODULE_ID], report.as_dict()
248
-
249
- async def a_execute(
250
- self,
251
- *,
252
- prompt: Prompt,
253
- goldens: Union[List["Golden"], List["ConversationalGolden"]],
254
- ) -> Tuple[Prompt, Dict]:
255
- """
256
- Asynchronous twin of execute().
257
- """
258
- total_goldens = len(goldens)
259
- if total_goldens < 1:
260
- raise DeepEvalError(
261
- "MIPRO prompt optimization requires at least 1 golden, but "
262
- f"received {total_goldens}. Provide at least one golden to run "
263
- "the optimizer."
264
- )
265
-
266
- self._ensure_scoring_adapter()
267
- self._ensure_rewriter()
268
- self.reset_state()
269
-
270
- seed_prompts_by_module = {self.SINGLE_MODULE_ID: prompt}
271
- root_prompt_configuration = PromptConfiguration.new(
272
- prompts=dict(seed_prompts_by_module)
273
- )
274
- # Add root candidate to the pool, but defer its first minibatch
275
- # evaluation until the first iteration so that any long running
276
- # model calls happen under the main loop (with progress updates).
277
- self._add_prompt_configuration(root_prompt_configuration)
278
-
279
- accepted_iterations: List[Dict] = []
280
- self.trial_index = 0
281
-
282
- async def _one_iteration() -> bool:
283
- nonlocal accepted_iterations
284
-
285
- if not goldens:
286
- return False
287
-
288
- # Lazily seed with a minibatch score for the root
289
- # candidate on the first iteration.
290
- if not self._minibatch_score_counts:
291
- seed_minibatch = self._draw_minibatch(goldens)
292
- root_score = await self.scoring_adapter.a_minibatch_score(
293
- root_prompt_configuration, seed_minibatch
294
- )
295
- self._record_minibatch_score(
296
- root_prompt_configuration.id, root_score
297
- )
298
-
299
- parent_prompt_configuration = self._select_candidate()
300
- selected_module_id: ModuleId = self.SINGLE_MODULE_ID
301
-
302
- minibatch = self._draw_minibatch(goldens)
303
-
304
- feedback_text = await self.scoring_adapter.a_minibatch_feedback(
305
- parent_prompt_configuration, selected_module_id, minibatch
306
- )
307
-
308
- child_prompt = await self._a_generate_child_prompt(
309
- selected_module_id,
310
- parent_prompt_configuration,
311
- feedback_text,
312
- )
313
- if child_prompt is None:
314
- self.trial_index += 1
315
- return True
316
-
317
- child_prompt_configuration = self._make_child(
318
- selected_module_id,
319
- parent_prompt_configuration,
320
- child_prompt,
321
- )
322
-
323
- child_score = await self.scoring_adapter.a_minibatch_score(
324
- child_prompt_configuration, minibatch
325
- )
326
-
327
- before_mean = self._mean_minibatch_score(
328
- parent_prompt_configuration.id
329
- )
330
-
331
- # 3. Evaluate & decide whether to accept the child
332
- jitter = 1e-6
333
- if child_score >= before_mean + max(self.config.min_delta, jitter):
334
- # Accept: add to pool, update minibatch-score history for this candidate,
335
- # and record the iteration.
336
- self._add_prompt_configuration(child_prompt_configuration)
337
- self._record_minibatch_score(
338
- child_prompt_configuration.id, child_score
339
- )
340
-
341
- accepted_iterations.append(
342
- AcceptedIterationDict(
343
- parent=parent_prompt_configuration.id,
344
- child=child_prompt_configuration.id,
345
- module=selected_module_id,
346
- before=before_mean,
347
- after=child_score,
348
- )
349
- )
350
- # else: reject; do not add child to the candidate pool
351
-
352
- self.trial_index += 1
353
- if (
354
- self.config.full_eval_every is not None
355
- and self.trial_index % self.config.full_eval_every == 0
356
- ):
357
- await self._a_full_evaluate_best(goldens)
358
-
359
- return True
360
-
361
- await self._a_run_loop_iteration(_one_iteration)
362
-
363
- if not self.pareto_score_table:
364
- await self._a_full_evaluate_best(goldens)
365
-
366
- best = self._best_by_aggregate()
367
- prompt_config_snapshots = build_prompt_config_snapshots(
368
- self.prompt_configurations_by_id
369
- )
370
- report = OptimizationResult(
371
- optimization_id=self.optimization_id,
372
- best_id=best.id,
373
- accepted_iterations=accepted_iterations,
374
- pareto_scores=self.pareto_score_table,
375
- parents=self.parents_by_id,
376
- prompt_configurations=prompt_config_snapshots,
377
- )
378
- return best.prompts[self.SINGLE_MODULE_ID], report.as_dict()
379
-
380
- ###################
381
- # State & helpers #
382
- ###################
383
-
384
- def reset_state(self) -> None:
385
- self.optimization_id = str(uuid.uuid4())
386
- self.prompt_configurations_by_id: Dict[
387
- PromptConfigurationId, PromptConfiguration
388
- ] = {}
389
- self.parents_by_id: Dict[
390
- PromptConfigurationId, Optional[PromptConfigurationId]
391
- ] = {}
392
- # For MIPRO we reuse the same field name as GEPA for full-eval scores.
393
- self.pareto_score_table: ScoreTable = {}
394
-
395
- # Surrogate stats: running mean minibatch scores per candidate.
396
- self._minibatch_score_sums: Dict[PromptConfigurationId, float] = {}
397
- self._minibatch_score_counts: Dict[PromptConfigurationId, int] = {}
398
-
399
- # Trial counter (used for full_eval_every).
400
- self.trial_index: int = 0
401
-
402
- def _ensure_scoring_adapter(self) -> None:
403
- if self.scoring_adapter is None:
404
- raise DeepEvalError(
405
- "MIPRORunner requires a `scoring_adapter`. "
406
- "Construct one (for example, DeepEvalScoringAdapter) in "
407
- "PromptOptimizer and assign it to `runner.scoring_adapter`."
408
- )
409
-
410
- def _ensure_rewriter(self) -> None:
411
- if self._rewriter is not None:
412
- return
413
-
414
- # Default basic PromptRewriter; PromptOptimizer can override this and
415
- # pass a configured instance
416
- self._rewriter = PromptRewriter(
417
- max_chars=self.config.rewrite_instruction_max_chars,
418
- random_state=self.random_state,
419
- )
420
-
421
- def _prompts_equivalent(
422
- self,
423
- old_prompt: Prompt,
424
- new_prompt: Prompt,
425
- ) -> bool:
426
- """
427
- Compare two Prompts for optimization purposes.
428
-
429
- We treat a child as "no change" if:
430
- - The types differ, or
431
- - For TEXT: trimmed text_template matches.
432
- - For LIST: messages_template length, roles, and trimmed content match.
433
- """
434
-
435
- if new_prompt.type == PromptType.LIST:
436
- old_msgs = old_prompt.messages_template
437
- new_msgs = new_prompt.messages_template
438
- if len(old_msgs) != len(new_msgs):
439
- return False
440
-
441
- for old_msg, new_msg in zip(old_msgs, new_msgs):
442
- if old_msg.role != new_msg.role:
443
- return False
444
- if (old_msg.content or "").strip() != (
445
- new_msg.content or ""
446
- ).strip():
447
- return False
448
-
449
- return True
450
-
451
- old_txt = (old_prompt.text_template or "").strip()
452
- new_txt = (new_prompt.text_template or "").strip()
453
- return new_txt == old_txt
454
-
455
- def _add_prompt_configuration(
456
- self,
457
- prompt_configuration: PromptConfiguration,
458
- ) -> None:
459
- self.prompt_configurations_by_id[prompt_configuration.id] = (
460
- prompt_configuration
461
- )
462
- self.parents_by_id[prompt_configuration.id] = (
463
- prompt_configuration.parent
464
- )
465
-
466
- def _record_minibatch_score(
467
- self,
468
- prompt_configuration_id: PromptConfigurationId,
469
- score: float,
470
- ) -> None:
471
- self._minibatch_score_sums[prompt_configuration_id] = (
472
- self._minibatch_score_sums.get(prompt_configuration_id, 0.0)
473
- + float(score)
474
- )
475
- self._minibatch_score_counts[prompt_configuration_id] = (
476
- self._minibatch_score_counts.get(prompt_configuration_id, 0) + 1
477
- )
478
-
479
- def _mean_minibatch_score(
480
- self,
481
- prompt_configuration_id: PromptConfigurationId,
482
- ) -> float:
483
- total = self._minibatch_score_sums.get(prompt_configuration_id, 0.0)
484
- count = self._minibatch_score_counts.get(prompt_configuration_id, 0)
485
- if count <= 0:
486
- # Use a sentinel that will not dominate selection if a scored
487
- # candidate exists. Root is scored lazily on first iteration.
488
- return float("-inf")
489
- return total / count
490
-
491
- def _best_by_minibatch(self) -> PromptConfiguration:
492
- """
493
- Return the candidate with the highest mean minibatch score.
494
- """
495
- if not self.prompt_configurations_by_id:
496
- raise DeepEvalError(
497
- "MIPRORunner has no prompt configurations; this should not happen."
498
- )
499
-
500
- best_id: Optional[PromptConfigurationId] = None
501
- best_score = float("-inf")
502
-
503
- for cand_id in self.prompt_configurations_by_id.keys():
504
- mean_score = self._mean_minibatch_score(cand_id)
505
- if mean_score > best_score:
506
- best_score = mean_score
507
- best_id = cand_id
508
-
509
- if best_id is None:
510
- # Fallback to the first candidate if all means are -inf.
511
- best_id = next(iter(self.prompt_configurations_by_id.keys()))
512
-
513
- return self.prompt_configurations_by_id[best_id]
514
-
515
- def _best_by_aggregate(self) -> PromptConfiguration:
516
- """
517
- Return the best candidate based on full evaluation scores.
518
-
519
- If no full eval scores are available (should be rare, but possible if
520
- full_eval_every is very large and the loop exits early), fall back to
521
- best-by-minibatch.
522
- """
523
- if not self.pareto_score_table:
524
- return self._best_by_minibatch()
525
-
526
- totals = {
527
- prompt_configuration_id: self.aggregate_instances(vector)
528
- for prompt_configuration_id, vector in self.pareto_score_table.items()
529
- }
530
-
531
- best_ids: List[PromptConfigurationId] = []
532
- best_val = float("-inf")
533
-
534
- for cand_id, agg in totals.items():
535
- if agg > best_val + 1e-12:
536
- best_val = agg
537
- best_ids = [cand_id]
538
- elif abs(agg - best_val) <= 1e-12:
539
- best_ids.append(cand_id)
540
-
541
- chosen_id = self.random_state.choice(best_ids)
542
- return self.prompt_configurations_by_id[chosen_id]
543
-
544
- def _select_candidate(self) -> PromptConfiguration:
545
- """
546
- Epsilon-greedy candidate selection:
547
-
548
- - With probability `exploration_probability`, pick a random candidate.
549
- - Otherwise, pick the candidate with the highest mean minibatch score.
550
- """
551
- if not self.prompt_configurations_by_id:
552
- raise DeepEvalError(
553
- "MIPRORunner has no prompt configurations to select from."
554
- )
555
-
556
- candidate_ids = list(self.prompt_configurations_by_id.keys())
557
- if not candidate_ids:
558
- raise DeepEvalError(
559
- "MIPRORunner has an empty candidate pool; this should not happen."
560
- )
561
-
562
- eps = float(self.config.exploration_probability)
563
- if eps > 0.0 and self.random_state.random() < eps:
564
- chosen_id = self.random_state.choice(candidate_ids)
565
- else:
566
- chosen_id = self._best_by_minibatch().id
567
-
568
- return self.prompt_configurations_by_id[chosen_id]
569
-
570
- def _draw_minibatch(
571
- self,
572
- goldens: Union[List["Golden"], List["ConversationalGolden"]],
573
- ) -> Union[List["Golden"], List["ConversationalGolden"]]:
574
- """
575
- Determine effective minibatch size from MIPROConfig, bounded by the
576
- available goldens, and sample with replacement.
577
- """
578
- n = len(goldens)
579
- if n <= 0:
580
- return []
581
-
582
- if self.config.minibatch_size is not None:
583
- size = self.config.minibatch_size
584
- else:
585
- dynamic = max(1, int(round(n * self.config.minibatch_ratio)))
586
- size = max(
587
- self.config.minibatch_min_size,
588
- min(dynamic, self.config.minibatch_max_size),
589
- )
590
-
591
- size = max(1, min(size, n))
592
-
593
- return [goldens[self.random_state.randrange(0, n)] for _ in range(size)]
594
-
595
- async def _a_full_evaluate_best(
596
- self,
597
- goldens: Union[List["Golden"], List["ConversationalGolden"]],
598
- ) -> None:
599
- if not self.prompt_configurations_by_id:
600
- return
601
-
602
- best = self._best_by_minibatch()
603
- if best.id in self.pareto_score_table:
604
- return
605
-
606
- scores = await self.scoring_adapter.a_score_on_pareto(best, goldens)
607
- self.pareto_score_table[best.id] = scores
608
-
609
- def _full_evaluate_best(
610
- self,
611
- goldens: Union[List["Golden"], List["ConversationalGolden"]],
612
- ) -> None:
613
- if not self.prompt_configurations_by_id:
614
- return
615
-
616
- best = self._best_by_minibatch()
617
- if best.id in self.pareto_score_table:
618
- return
619
-
620
- scores = self.scoring_adapter.score_on_pareto(best, goldens)
621
- self.pareto_score_table[best.id] = scores
622
-
623
- async def _a_generate_child_prompt(
624
- self,
625
- selected_module_id: ModuleId,
626
- parent_prompt_configuration: PromptConfiguration,
627
- feedback_text: str,
628
- ) -> Optional[Prompt]:
629
- try:
630
- old_prompt = parent_prompt_configuration.prompts[selected_module_id]
631
- except KeyError as exc:
632
- raise DeepEvalError(
633
- f"MIPRORunner expected a prompt for module_id "
634
- f"{selected_module_id!r} but none was found in the "
635
- "current prompt configuration."
636
- ) from exc
637
-
638
- new_prompt = await self._rewriter.a_rewrite(
639
- model_callback=self.model_callback,
640
- module_id=selected_module_id,
641
- old_prompt=old_prompt,
642
- feedback_text=feedback_text,
643
- )
644
-
645
- if old_prompt.type != new_prompt.type or self._prompts_equivalent(
646
- old_prompt, new_prompt
647
- ):
648
- # Don't accept if new prompt is the same as parent, or if type changed.
649
- return None
650
- return new_prompt
651
-
652
- def _generate_child_prompt(
653
- self,
654
- selected_module_id: ModuleId,
655
- parent_prompt_configuration: PromptConfiguration,
656
- feedback_text: str,
657
- ) -> Optional[Prompt]:
658
- try:
659
- old_prompt = parent_prompt_configuration.prompts[selected_module_id]
660
- except KeyError as exc:
661
- # this should never happen
662
- raise DeepEvalError(
663
- f"MIPRORunner expected a prompt for module_id "
664
- f"{selected_module_id!r} but none was found in the "
665
- "current prompt configuration."
666
- ) from exc
667
-
668
- new_prompt = self._rewriter.rewrite(
669
- model_callback=self.model_callback,
670
- module_id=selected_module_id,
671
- old_prompt=old_prompt,
672
- feedback_text=feedback_text,
673
- )
674
-
675
- if old_prompt.type != new_prompt.type or self._prompts_equivalent(
676
- old_prompt, new_prompt
677
- ):
678
- # Don't accept if new prompt is the same as parent, or if type changed.
679
- return None
680
- return new_prompt
681
-
682
- def _make_child(
683
- self,
684
- selected_module_id: ModuleId,
685
- parent_prompt_configuration: PromptConfiguration,
686
- child_prompt: Prompt,
687
- ) -> PromptConfiguration:
688
- child_prompt_configuration = PromptConfiguration.new(
689
- prompts=dict(parent_prompt_configuration.prompts),
690
- parent=parent_prompt_configuration.id,
691
- )
692
- child_prompt_configuration.prompts[selected_module_id] = child_prompt
693
- return child_prompt_configuration
694
-
695
- def _update_progress(
696
- self,
697
- total_iterations: int,
698
- iteration: int,
699
- remaining_iterations: int,
700
- elapsed: float,
701
- ):
702
- if self.status_callback is not None:
703
- detail = (
704
- f"(iterations={total_iterations}) "
705
- f"• iteration {iteration}/{total_iterations} "
706
- f"• {elapsed:.2f}s • remaining={remaining_iterations}"
707
- )
708
- self.status_callback(
709
- RunnerStatusType.PROGRESS,
710
- step_index=iteration,
711
- total_steps=total_iterations,
712
- detail=detail,
713
- )
714
-
715
- def _update_error(
716
- self,
717
- total_iterations: int,
718
- iteration: int,
719
- exc: Exception,
720
- ):
721
- # Report a user-facing error event.
722
- if self.status_callback is not None:
723
- detail = (
724
- f"(iterations={total_iterations}) "
725
- f"• error {exc.__class__.__name__}: {exc} "
726
- f"• halted at iteration {iteration}"
727
- )
728
- self.status_callback(
729
- RunnerStatusType.ERROR,
730
- step_index=iteration,
731
- total_steps=total_iterations,
732
- detail=detail,
733
- )
734
-
735
- def _run_loop_iteration(
736
- self,
737
- mipro_iteration: Callable[[], bool],
738
- ) -> None:
739
- total_iterations = self.config.iterations
740
- remaining_iterations = total_iterations
741
- iteration = 0
742
- self._update_progress(
743
- total_iterations, iteration, remaining_iterations, 0.0
744
- )
745
- while remaining_iterations > 0:
746
- iteration += 1
747
- start_time = time.perf_counter()
748
- try:
749
- ok = mipro_iteration()
750
- except Exception as exc:
751
- self._update_error(total_iterations, iteration, exc)
752
- break
753
- elapsed = time.perf_counter() - start_time
754
- if not ok:
755
- break
756
- remaining_iterations -= 1
757
- self._update_progress(
758
- total_iterations, iteration, remaining_iterations, elapsed
759
- )
760
-
761
- async def _a_run_loop_iteration(
762
- self,
763
- a_mipro_iteration: Callable[[], Awaitable[bool]],
764
- ) -> None:
765
- total_iterations = self.config.iterations
766
- remaining_iterations = total_iterations
767
- iteration = 0
768
- self._update_progress(
769
- total_iterations, iteration, remaining_iterations, 0.0
770
- )
771
- while remaining_iterations > 0:
772
- iteration += 1
773
- start_time = time.perf_counter()
774
- try:
775
- ok = await a_mipro_iteration()
776
- except Exception as exc:
777
- self._update_error(total_iterations, iteration, exc)
778
- break
779
- elapsed = time.perf_counter() - start_time
780
- if not ok:
781
- break
782
- remaining_iterations -= 1
783
- self._update_progress(
784
- total_iterations, iteration, remaining_iterations, elapsed
785
- )
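
The header comment of the removed deepeval/optimization/miprov2/loop.py above summarizes its 0-shot MIPRO-style loop: keep an unbounded pool of candidate prompts, pick a parent by epsilon-greedy selection on mean minibatch score, propose one rewritten child per iteration, accept the child only if its minibatch score beats the parent's running mean by at least `min_delta`, and optionally re-score the current best candidate on the full golden set every `full_eval_every` iterations. The sketch below restates that acceptance loop in isolation; it is illustrative only, and every name in it (`Candidate`, `score_fn`, `rewrite_fn`, `optimize_prompt`) is hypothetical rather than part of the deepeval API.

# Minimal standalone sketch of the acceptance loop described in the removed
# runner's header comment. Hypothetical names; not deepeval APIs.
import random
from dataclasses import dataclass, field
from typing import Callable, List, Optional


@dataclass
class Candidate:
    prompt: str
    parent: Optional[int] = None  # index of the parent candidate, if any
    scores: List[float] = field(default_factory=list)  # minibatch score history

    @property
    def mean_score(self) -> float:
        return sum(self.scores) / len(self.scores) if self.scores else float("-inf")


def optimize_prompt(
    seed_prompt: str,
    goldens: list,
    score_fn: Callable[[str, list], float],  # scores a prompt on a minibatch of goldens
    rewrite_fn: Callable[[str], str],        # proposes a child prompt from a parent
    iterations: int = 20,
    minibatch_size: int = 4,
    min_delta: float = 0.0,
    exploration_probability: float = 0.2,
    seed: int = 42,
) -> str:
    if not goldens:
        raise ValueError("Prompt optimization requires at least one golden.")

    rng = random.Random(seed)
    pool: List[Candidate] = [Candidate(prompt=seed_prompt)]

    def draw_minibatch() -> list:
        # Sample with replacement, bounded by the number of available goldens.
        size = max(1, min(minibatch_size, len(goldens)))
        return [goldens[rng.randrange(len(goldens))] for _ in range(size)]

    # Score the root candidate once so greedy selection has something to compare.
    pool[0].scores.append(score_fn(seed_prompt, draw_minibatch()))

    for _ in range(iterations):
        # Epsilon-greedy parent selection on mean minibatch score.
        if rng.random() < exploration_probability:
            parent_idx = rng.randrange(len(pool))
        else:
            parent_idx = max(range(len(pool)), key=lambda i: pool[i].mean_score)
        parent = pool[parent_idx]

        # Propose a single child and score it on a fresh minibatch.
        child_prompt = rewrite_fn(parent.prompt)
        child_score = score_fn(child_prompt, draw_minibatch())

        # Accept only if the child clears the parent's running mean by min_delta.
        # (The removed runner also added a tiny jitter so min_delta=0 still
        # requires strict improvement.)
        if child_score >= parent.mean_score + min_delta:
            pool.append(
                Candidate(prompt=child_prompt, parent=parent_idx, scores=[child_score])
            )

    # Return the candidate with the best mean minibatch score.
    return max(pool, key=lambda c: c.mean_score).prompt

Rejecting children that fail to clear `min_delta` keeps the pool from filling with near-duplicates, while the per-candidate running mean smooths the noise of scoring on small minibatches.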