deepeval 3.7.3__py3-none-any.whl → 3.7.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (156)
  1. deepeval/_version.py +1 -1
  2. deepeval/cli/test.py +1 -1
  3. deepeval/config/settings.py +102 -13
  4. deepeval/dataset/golden.py +54 -2
  5. deepeval/evaluate/configs.py +1 -1
  6. deepeval/evaluate/evaluate.py +16 -8
  7. deepeval/evaluate/execute.py +74 -27
  8. deepeval/evaluate/utils.py +26 -22
  9. deepeval/integrations/pydantic_ai/agent.py +19 -2
  10. deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
  11. deepeval/metrics/__init__.py +14 -12
  12. deepeval/metrics/answer_relevancy/answer_relevancy.py +74 -29
  13. deepeval/metrics/answer_relevancy/template.py +188 -92
  14. deepeval/metrics/argument_correctness/template.py +2 -2
  15. deepeval/metrics/base_metric.py +2 -5
  16. deepeval/metrics/bias/template.py +3 -3
  17. deepeval/metrics/contextual_precision/contextual_precision.py +53 -15
  18. deepeval/metrics/contextual_precision/template.py +115 -66
  19. deepeval/metrics/contextual_recall/contextual_recall.py +50 -13
  20. deepeval/metrics/contextual_recall/template.py +106 -55
  21. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +47 -15
  22. deepeval/metrics/contextual_relevancy/template.py +87 -58
  23. deepeval/metrics/conversation_completeness/template.py +2 -2
  24. deepeval/metrics/conversational_dag/templates.py +4 -4
  25. deepeval/metrics/conversational_g_eval/template.py +4 -3
  26. deepeval/metrics/dag/templates.py +5 -5
  27. deepeval/metrics/faithfulness/faithfulness.py +70 -27
  28. deepeval/metrics/faithfulness/schema.py +1 -1
  29. deepeval/metrics/faithfulness/template.py +200 -115
  30. deepeval/metrics/g_eval/utils.py +2 -2
  31. deepeval/metrics/hallucination/template.py +4 -4
  32. deepeval/metrics/indicator.py +4 -4
  33. deepeval/metrics/misuse/template.py +2 -2
  34. deepeval/metrics/multimodal_metrics/__init__.py +0 -18
  35. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +24 -17
  36. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +26 -21
  37. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +24 -17
  38. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +24 -17
  39. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +19 -19
  40. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +63 -78
  41. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +20 -20
  42. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +71 -50
  43. deepeval/metrics/non_advice/template.py +2 -2
  44. deepeval/metrics/pii_leakage/template.py +2 -2
  45. deepeval/metrics/prompt_alignment/template.py +4 -4
  46. deepeval/metrics/ragas.py +3 -3
  47. deepeval/metrics/role_violation/template.py +2 -2
  48. deepeval/metrics/step_efficiency/step_efficiency.py +1 -1
  49. deepeval/metrics/tool_correctness/tool_correctness.py +2 -2
  50. deepeval/metrics/toxicity/template.py +4 -4
  51. deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  52. deepeval/metrics/turn_contextual_precision/template.py +187 -0
  53. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +550 -0
  54. deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  55. deepeval/metrics/turn_contextual_recall/template.py +178 -0
  56. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +520 -0
  57. deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
  58. deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
  59. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +535 -0
  60. deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
  61. deepeval/metrics/turn_faithfulness/template.py +218 -0
  62. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +596 -0
  63. deepeval/metrics/turn_relevancy/template.py +2 -2
  64. deepeval/metrics/utils.py +39 -58
  65. deepeval/models/__init__.py +0 -12
  66. deepeval/models/base_model.py +16 -38
  67. deepeval/models/embedding_models/__init__.py +7 -0
  68. deepeval/models/embedding_models/azure_embedding_model.py +69 -32
  69. deepeval/models/embedding_models/local_embedding_model.py +39 -22
  70. deepeval/models/embedding_models/ollama_embedding_model.py +42 -18
  71. deepeval/models/embedding_models/openai_embedding_model.py +50 -15
  72. deepeval/models/llms/amazon_bedrock_model.py +1 -2
  73. deepeval/models/llms/anthropic_model.py +53 -20
  74. deepeval/models/llms/azure_model.py +140 -43
  75. deepeval/models/llms/deepseek_model.py +38 -23
  76. deepeval/models/llms/gemini_model.py +222 -103
  77. deepeval/models/llms/grok_model.py +39 -27
  78. deepeval/models/llms/kimi_model.py +39 -23
  79. deepeval/models/llms/litellm_model.py +103 -45
  80. deepeval/models/llms/local_model.py +35 -22
  81. deepeval/models/llms/ollama_model.py +129 -17
  82. deepeval/models/llms/openai_model.py +151 -50
  83. deepeval/models/llms/portkey_model.py +149 -0
  84. deepeval/models/llms/utils.py +5 -3
  85. deepeval/models/retry_policy.py +17 -14
  86. deepeval/models/utils.py +94 -4
  87. deepeval/optimizer/__init__.py +5 -0
  88. deepeval/optimizer/algorithms/__init__.py +6 -0
  89. deepeval/optimizer/algorithms/base.py +29 -0
  90. deepeval/optimizer/algorithms/configs.py +18 -0
  91. deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  92. deepeval/optimizer/algorithms/copro/copro.py +836 -0
  93. deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  94. deepeval/optimizer/algorithms/gepa/gepa.py +737 -0
  95. deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  96. deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  97. deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  98. deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  99. deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  100. deepeval/optimizer/algorithms/simba/simba.py +999 -0
  101. deepeval/optimizer/algorithms/simba/types.py +15 -0
  102. deepeval/optimizer/configs.py +31 -0
  103. deepeval/optimizer/policies.py +227 -0
  104. deepeval/optimizer/prompt_optimizer.py +263 -0
  105. deepeval/optimizer/rewriter/__init__.py +5 -0
  106. deepeval/optimizer/rewriter/rewriter.py +124 -0
  107. deepeval/optimizer/rewriter/utils.py +214 -0
  108. deepeval/optimizer/scorer/__init__.py +5 -0
  109. deepeval/optimizer/scorer/base.py +86 -0
  110. deepeval/optimizer/scorer/scorer.py +316 -0
  111. deepeval/optimizer/scorer/utils.py +30 -0
  112. deepeval/optimizer/types.py +148 -0
  113. deepeval/optimizer/utils.py +480 -0
  114. deepeval/prompt/prompt.py +7 -6
  115. deepeval/test_case/__init__.py +1 -3
  116. deepeval/test_case/api.py +12 -10
  117. deepeval/test_case/conversational_test_case.py +19 -1
  118. deepeval/test_case/llm_test_case.py +152 -1
  119. deepeval/test_case/utils.py +4 -8
  120. deepeval/test_run/api.py +15 -14
  121. deepeval/test_run/cache.py +2 -0
  122. deepeval/test_run/test_run.py +9 -4
  123. deepeval/tracing/patchers.py +9 -4
  124. deepeval/tracing/tracing.py +2 -2
  125. deepeval/utils.py +89 -0
  126. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/METADATA +1 -4
  127. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/RECORD +134 -118
  128. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
  129. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
  130. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
  131. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
  132. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
  133. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
  134. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
  135. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
  136. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
  137. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
  138. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
  139. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  140. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
  141. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
  142. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  143. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
  144. deepeval/models/mlllms/__init__.py +0 -4
  145. deepeval/models/mlllms/azure_model.py +0 -334
  146. deepeval/models/mlllms/gemini_model.py +0 -284
  147. deepeval/models/mlllms/ollama_model.py +0 -144
  148. deepeval/models/mlllms/openai_model.py +0 -258
  149. deepeval/test_case/mllm_test_case.py +0 -170
  150. /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
  151. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
  152. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
  153. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
  154. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/LICENSE.md +0 -0
  155. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/WHEEL +0 -0
  156. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/entry_points.txt +0 -0
deepeval/optimizer/algorithms/copro/copro.py (new file)
@@ -0,0 +1,836 @@
+# - COPRO cooperative 0-shot variant:
+#   - Works on a single set of goldens (no D_pareto split).
+#   - Maintains a bounded population of candidate prompts
+#     (size controlled by `population_size`).
+#   - At each iteration:
+#     - Select a parent via epsilon-greedy on mean minibatch score.
+#     - Sample a minibatch of goldens for scoring.
+#     - Compute feedback once for the parent + minibatch.
+#     - Propose multiple child prompts cooperatively from the same parent
+#       (up to `proposals_per_step` children).
+#     - For each child, accept it if its minibatch score improves on the
+#       parent by at least `min_delta`, add it to the pool, and prune
+#       low-scoring candidates if the population exceeds `population_size`.
+#   - Uses `full_eval_every` (if set) to periodically re-score the current
+#     best candidate on the full golden set.
+
+from __future__ import annotations
+
+import random
+import time
+import uuid
+from typing import (
+    TYPE_CHECKING,
+    Awaitable,
+    Callable,
+    Dict,
+    List,
+    Optional,
+    Tuple,
+    Union,
+)
+
+from deepeval.models.base_model import DeepEvalBaseLLM
+
+from deepeval.errors import DeepEvalError
+from deepeval.optimizer.utils import Aggregator, mean_of_all
+from deepeval.optimizer.types import (
+    AcceptedIterationDict,
+    ModuleId,
+    OptimizationReport,
+    PromptConfiguration,
+    PromptConfigurationId,
+    RunnerStatusCallback,
+    RunnerStatusType,
+    ScoreTable,
+)
+from deepeval.optimizer.scorer.base import BaseScorer
+from deepeval.optimizer.utils import (
+    build_prompt_config_snapshots,
+)
+from deepeval.prompt.api import PromptType
+from deepeval.prompt.prompt import Prompt
+from deepeval.optimizer.rewriter import Rewriter
+from deepeval.optimizer.algorithms.configs import MIPROV2_MIN_DELTA
+from deepeval.optimizer.algorithms.base import BaseAlgorithm
+
+if TYPE_CHECKING:  # pragma: no cover - type-checking only
+    from deepeval.dataset.golden import ConversationalGolden, Golden
+
+
+class COPRO(BaseAlgorithm):
+    """
+    COPRO style cooperative prompt optimization loop with sync/async execution.
+
+    This runner is intentionally low level and does not know about metrics,
+    models, or async configs. It relies on a preconfigured Scorer and
+    Rewriter, which are typically constructed by PromptOptimizer.
+
+    Parameters
+    ----------
+    iterations : int
+        Total number of optimization trials. Default is 5.
+    minibatch_size : int
+        Number of examples drawn per iteration. Default is 8.
+    random_seed : int, optional
+        RNG seed for reproducibility. If None, derived from time.time_ns().
+    exploration_probability : float
+        Epsilon greedy exploration rate. Default is 0.2.
+    full_eval_every : int, optional
+        Fully evaluate best candidate every N trials. Default is 5.
+    population_size : int
+        Maximum number of candidates in the pool. Default is 4.
+    proposals_per_step : int
+        Number of child prompts proposed per iteration. Default is 4.
+    """
+
+    name = "COPRO"
+    SINGLE_MODULE_ID: ModuleId = "__module__"
+
+    def __init__(
+        self,
+        iterations: int = 5,
+        minibatch_size: int = 8,
+        random_seed: Optional[int] = None,
+        exploration_probability: float = 0.2,
+        full_eval_every: Optional[int] = 5,
+        population_size: int = 4,
+        proposals_per_step: int = 4,
+        aggregate_instances: Aggregator = mean_of_all,
+        scorer: Optional[BaseScorer] = None,
+    ) -> None:
+        # Validate parameters
+        if iterations < 1:
+            raise ValueError("iterations must be >= 1")
+        if minibatch_size < 1:
+            raise ValueError("minibatch_size must be >= 1")
+        if exploration_probability < 0.0 or exploration_probability > 1.0:
+            raise ValueError(
+                "exploration_probability must be >= 0.0 and <= 1.0"
+            )
+        if full_eval_every is not None and full_eval_every < 1:
+            raise ValueError("full_eval_every must be >= 1")
+        if population_size < 1:
+            raise ValueError("population_size must be >= 1")
+        if proposals_per_step < 1:
+            raise ValueError("proposals_per_step must be >= 1")
+
+        self.iterations = iterations
+        self.minibatch_size = minibatch_size
+        self.exploration_probability = exploration_probability
+        self.full_eval_every = full_eval_every
+        self.population_size = population_size
+        self.proposals_per_step = proposals_per_step
+        self.aggregate_instances = aggregate_instances
+        self.scorer = scorer
+
+        # If no seed provided, use time-based seed
+        if random_seed is None:
+            random_seed = time.time_ns()
+        self.random_seed = random_seed
+        self.random_state = random.Random(random_seed)
+
+        # Runtime state to be reset between runs
+        self.reset_state()
+
+        # Status callback set by PromptOptimizer:
+        # (kind, step_index, total_steps, detail) -> None
+        self.status_callback: Optional[RunnerStatusCallback] = None
+
+        # Optimizer model used by the rewriter for prompt mutation.
+        # Set by PromptOptimizer.
+        self.optimizer_model: Optional["DeepEvalBaseLLM"] = None
+
+        # Lazy-loaded Rewriter set by PromptOptimizer
+        self._rewriter: Optional[Rewriter] = None
+
+    ##############
+    # Public API #
+    ##############
+
+    def execute(
+        self,
+        prompt: Prompt,
+        goldens: Union[List["Golden"], List["ConversationalGolden"]],
+    ) -> Tuple[Prompt, OptimizationReport]:
+        """
+        Synchronous COPRO run from a full list of goldens.
+
+        The full goldens set is used both for mini-batched scoring during
+        optimization and for a final full evaluation of the best candidate.
+        """
+        total_goldens = len(goldens)
+        if total_goldens < 1:
+            raise DeepEvalError(
+                "COPRO prompt optimization requires at least 1 golden, but "
+                f"received {total_goldens}. Provide at least one golden to run "
+                "the optimizer."
+            )
+
+        self._ensure_scorer()
+        self.reset_state()
+
+        # Seed candidate pool with the root prompt configuration.
+        seed_prompts_by_module = {self.SINGLE_MODULE_ID: prompt}
+        root_prompt_configuration = PromptConfiguration.new(
+            prompts=dict(seed_prompts_by_module)
+        )
+        # Add root candidate to the pool, but defer its first minibatch
+        # evaluation until the first iteration so that any long running
+        # model calls happen under the main loop (with progress updates).
+        self._add_prompt_configuration(root_prompt_configuration)
+
+        accepted_iterations: List[Dict] = []
+        self.trial_index = 0
+
+        def _one_iteration() -> bool:
+            nonlocal accepted_iterations
+
+            if not goldens:
+                return False
+
+            # Lazily seed with a minibatch score for the root
+            # candidate on the first iteration.
+            if not self._minibatch_score_counts:
+                seed_minibatch = self._draw_minibatch(goldens)
+                root_score = self.scorer.score_minibatch(
+                    root_prompt_configuration, seed_minibatch
+                )
+                self._record_minibatch_score(
+                    root_prompt_configuration.id, root_score
+                )
+
+            # 1. Choose which candidate prompt to mutate.
+            parent_prompt_configuration = self._select_candidate()
+            selected_module_id: ModuleId = self.SINGLE_MODULE_ID
+
+            minibatch = self._draw_minibatch(goldens)
+
+            # Compute shared feedback for this parent/minibatch that will be
+            # used by all cooperative child proposals.
+            feedback_text = self.scorer.get_minibatch_feedback(
+                parent_prompt_configuration, selected_module_id, minibatch
+            )
+
+            before_mean = self._mean_minibatch_score(
+                parent_prompt_configuration.id
+            )
+            jitter = 1e-6
+            min_delta = max(MIPROV2_MIN_DELTA, jitter)
+
+            # 2. Generate multiple cooperative child prompts and evaluate them.
+            num_proposals = int(self.proposals_per_step)
+            for _ in range(num_proposals):
+                child_prompt = self._generate_child_prompt(
+                    selected_module_id,
+                    parent_prompt_configuration,
+                    feedback_text,
+                )
+                if child_prompt is None:
+                    # No child, nothing more to do this iteration
+                    continue
+
+                child_prompt_configuration = self._make_child(
+                    selected_module_id,
+                    parent_prompt_configuration,
+                    child_prompt,
+                )
+
+                child_score = self.scorer.score_minibatch(
+                    child_prompt_configuration, minibatch
+                )
+
+                # 3. Evaluate & decide whether to accept the child.
+                if child_score >= before_mean + min_delta:
+                    # Accept: add to pool, update surrogate stats, and record iteration.
+                    self._add_prompt_configuration(child_prompt_configuration)
+                    self._record_minibatch_score(
+                        child_prompt_configuration.id, child_score
+                    )
+
+                    accepted_iterations.append(
+                        AcceptedIterationDict(
+                            parent=parent_prompt_configuration.id,
+                            child=child_prompt_configuration.id,
+                            module=selected_module_id,
+                            before=before_mean,
+                            after=child_score,
+                        )
+                    )
+                # else: reject; do not add child to the candidate pool.
+
+            self.trial_index += 1
+            if (
+                self.full_eval_every is not None
+                and self.trial_index % self.full_eval_every == 0
+            ):
+                self._full_evaluate_best(goldens)
+
+            return True
+
+        self._run_loop_iteration(_one_iteration)
+
+        # Ensure at least one candidate has been fully evaluated.
+        if not self.pareto_score_table:
+            self._full_evaluate_best(goldens)
+
+        best = self._best_by_aggregate()
+        prompt_config_snapshots = build_prompt_config_snapshots(
+            self.prompt_configurations_by_id
+        )
+        report = OptimizationReport(
+            optimization_id=self.optimization_id,
+            best_id=best.id,
+            accepted_iterations=accepted_iterations,
+            pareto_scores=self.pareto_score_table,
+            parents=self.parents_by_id,
+            prompt_configurations=prompt_config_snapshots,
+        )
+        return best.prompts[self.SINGLE_MODULE_ID], report
+
+    async def a_execute(
+        self,
+        prompt: Prompt,
+        goldens: Union[List["Golden"], List["ConversationalGolden"]],
+    ) -> Tuple[Prompt, OptimizationReport]:
+        """
+        Asynchronous twin of execute().
+        """
+        total_goldens = len(goldens)
+        if total_goldens < 1:
+            raise DeepEvalError(
+                "COPRO prompt optimization requires at least 1 golden, but "
+                f"received {total_goldens}. Provide at least one golden to run "
+                "the optimizer."
+            )
+
+        self._ensure_scorer()
+        self.reset_state()
+
+        seed_prompts_by_module = {self.SINGLE_MODULE_ID: prompt}
+        root_prompt_configuration = PromptConfiguration.new(
+            prompts=dict(seed_prompts_by_module)
+        )
+        # Add root candidate to the pool, but defer its first minibatch
+        # evaluation until the first iteration so that any long running
+        # model calls happen under the main loop (with progress updates).
+        self._add_prompt_configuration(root_prompt_configuration)
+
+        accepted_iterations: List[Dict] = []
+        self.trial_index = 0
+
+        async def _one_iteration() -> bool:
+            nonlocal accepted_iterations
+
+            if not goldens:
+                return False
+
+            # Lazily seed with a minibatch score for the root
+            # candidate on the first iteration.
+            if not self._minibatch_score_counts:
+                seed_minibatch = self._draw_minibatch(goldens)
+                root_score = await self.scorer.a_score_minibatch(
+                    root_prompt_configuration, seed_minibatch
+                )
+                self._record_minibatch_score(
+                    root_prompt_configuration.id, root_score
+                )
+
+            parent_prompt_configuration = self._select_candidate()
+            selected_module_id: ModuleId = self.SINGLE_MODULE_ID
+
+            minibatch = self._draw_minibatch(goldens)
+
+            feedback_text = await self.scorer.a_get_minibatch_feedback(
+                parent_prompt_configuration, selected_module_id, minibatch
+            )
+
+            before_mean = self._mean_minibatch_score(
+                parent_prompt_configuration.id
+            )
+            jitter = 1e-6
+            min_delta = max(MIPROV2_MIN_DELTA, jitter)
+
+            num_proposals = int(self.proposals_per_step)
+            for _ in range(num_proposals):
+                child_prompt = await self._a_generate_child_prompt(
+                    selected_module_id,
+                    parent_prompt_configuration,
+                    feedback_text,
+                )
+                if child_prompt is None:
+                    continue
+
+                child_prompt_configuration = self._make_child(
+                    selected_module_id,
+                    parent_prompt_configuration,
+                    child_prompt,
+                )
+
+                child_score = await self.scorer.a_score_minibatch(
+                    child_prompt_configuration, minibatch
+                )
+
+                if child_score >= before_mean + min_delta:
+                    self._add_prompt_configuration(child_prompt_configuration)
+                    self._record_minibatch_score(
+                        child_prompt_configuration.id, child_score
+                    )
+
+                    accepted_iterations.append(
+                        AcceptedIterationDict(
+                            parent=parent_prompt_configuration.id,
+                            child=child_prompt_configuration.id,
+                            module=selected_module_id,
+                            before=before_mean,
+                            after=child_score,
+                        )
+                    )
+
+            self.trial_index += 1
+            if (
+                self.full_eval_every is not None
+                and self.trial_index % self.full_eval_every == 0
+            ):
+                await self._a_full_evaluate_best(goldens)
+
+            return True
+
+        await self._a_run_loop_iteration(_one_iteration)
+
+        if not self.pareto_score_table:
+            await self._a_full_evaluate_best(goldens)
+
+        best = self._best_by_aggregate()
+        prompt_config_snapshots = build_prompt_config_snapshots(
+            self.prompt_configurations_by_id
+        )
+        report = OptimizationReport(
+            optimization_id=self.optimization_id,
+            best_id=best.id,
+            accepted_iterations=accepted_iterations,
+            pareto_scores=self.pareto_score_table,
+            parents=self.parents_by_id,
+            prompt_configurations=prompt_config_snapshots,
+        )
+        return best.prompts[self.SINGLE_MODULE_ID], report
+
+    ###################
+    # State & helpers #
+    ###################
+
+    def reset_state(self) -> None:
+        self.optimization_id = str(uuid.uuid4())
+        self.prompt_configurations_by_id: Dict[
+            PromptConfigurationId, PromptConfiguration
+        ] = {}
+        self.parents_by_id: Dict[
+            PromptConfigurationId, Optional[PromptConfigurationId]
+        ] = {}
+        # For COPRO we reuse the same field name as GEPA for full evaluation scores.
+        self.pareto_score_table: ScoreTable = {}
+
+        # Surrogate stats: running mean minibatch scores per candidate.
+        self._minibatch_score_sums: Dict[PromptConfigurationId, float] = {}
+        self._minibatch_score_counts: Dict[PromptConfigurationId, int] = {}
+
+        # Trial counter (used for full_eval_every).
+        self.trial_index: int = 0
+
+    def _ensure_scorer(self) -> None:
+        if self.scorer is None:
+            raise DeepEvalError(
+                "COPRORunner requires a `scorer`. "
+                "Construct one (for example, Scorer) in "
+                "PromptOptimizer and assign it to `runner.scorer`."
+            )
+
+    def _prompts_equivalent(
+        self,
+        old_prompt: Prompt,
+        new_prompt: Prompt,
+    ) -> bool:
+        """
+        Compare two Prompts for optimization purposes.
+
+        We treat a child as "no change" if:
+        - The types differ, or
+        - For TEXT: trimmed text_template matches.
+        - For LIST: messages_template length, roles, and trimmed content match.
+        """
+
+        if new_prompt.type == PromptType.LIST:
+            old_msgs = old_prompt.messages_template
+            new_msgs = new_prompt.messages_template
+            if len(old_msgs) != len(new_msgs):
+                return False
+
+            for old_msg, new_msg in zip(old_msgs, new_msgs):
+                if old_msg.role != new_msg.role:
+                    return False
+                if (old_msg.content or "").strip() != (
+                    new_msg.content or ""
+                ).strip():
+                    return False
+
+            return True
+
+        old_txt = (old_prompt.text_template or "").strip()
+        new_txt = (new_prompt.text_template or "").strip()
+        return new_txt == old_txt
+
+    def _add_prompt_configuration(
+        self,
+        prompt_configuration: PromptConfiguration,
+    ) -> None:
+        """
+        Add a candidate to the active pool and, if a population limit is set,
+        prune the worst-scoring candidates to enforce it.
+        """
+        self.prompt_configurations_by_id[prompt_configuration.id] = (
+            prompt_configuration
+        )
+        self.parents_by_id[prompt_configuration.id] = (
+            prompt_configuration.parent
+        )
+
+        # If we exceed the population size, iteratively prune the worst
+        # (by mean minibatch score), never removing the current best.
+        while len(self.prompt_configurations_by_id) > self.population_size:
+            best_id: Optional[PromptConfigurationId] = None
+            best_score = float("-inf")
+            for cand_id in self.prompt_configurations_by_id.keys():
+                mean_score = self._mean_minibatch_score(cand_id)
+                if mean_score > best_score:
+                    best_score = mean_score
+                    best_id = cand_id
+
+            worst_id: Optional[PromptConfigurationId] = None
+            worst_score = float("inf")
+            for cand_id in self.prompt_configurations_by_id.keys():
+                if cand_id == best_id:
+                    continue
+                mean_score = self._mean_minibatch_score(cand_id)
+                if mean_score < worst_score:
+                    worst_score = mean_score
+                    worst_id = cand_id
+
+            if worst_id is None or worst_id == best_id:
+                break
+
+            # Prune the chosen worst candidate from all bookkeeping tables.
+            self.prompt_configurations_by_id.pop(worst_id, None)
+            self.parents_by_id.pop(worst_id, None)
+            self._minibatch_score_sums.pop(worst_id, None)
+            self._minibatch_score_counts.pop(worst_id, None)
+            self.pareto_score_table.pop(worst_id, None)
+
+    def _record_minibatch_score(
+        self,
+        prompt_configuration_id: PromptConfigurationId,
+        score: float,
+    ) -> None:
+        self._minibatch_score_sums[prompt_configuration_id] = (
+            self._minibatch_score_sums.get(prompt_configuration_id, 0.0)
+            + float(score)
+        )
+        self._minibatch_score_counts[prompt_configuration_id] = (
+            self._minibatch_score_counts.get(prompt_configuration_id, 0) + 1
+        )
+
+    def _mean_minibatch_score(
+        self,
+        prompt_configuration_id: PromptConfigurationId,
+    ) -> float:
+        total = self._minibatch_score_sums.get(prompt_configuration_id, 0.0)
+        count = self._minibatch_score_counts.get(prompt_configuration_id, 0)
+        if count <= 0:
+            # Use a sentinel that will not dominate selection if a scored
+            # candidate exists. Root is seeded explicitly in the first iteration.
+            return float("-inf")
+        return total / count
+
+    def _best_by_minibatch(self) -> PromptConfiguration:
+        """
+        Return the candidate with the highest mean minibatch score.
+        """
+        if not self.prompt_configurations_by_id:
+            raise DeepEvalError(
+                "COPRORunner has no prompt configurations; this should not happen."
+            )
+
+        best_id: Optional[PromptConfigurationId] = None
+        best_score = float("-inf")
+
+        for cand_id in self.prompt_configurations_by_id.keys():
+            mean_score = self._mean_minibatch_score(cand_id)
+            if mean_score > best_score:
+                best_score = mean_score
+                best_id = cand_id
+
+        if best_id is None:
+            # Fallback to the first candidate if all means are -inf.
+            best_id = next(iter(self.prompt_configurations_by_id.keys()))
+
+        return self.prompt_configurations_by_id[best_id]
+
+    def _best_by_aggregate(self) -> PromptConfiguration:
+        """
+        Return the best candidate based on full-eval scores.
+
+        If no full evaluation scores are available (should be rare, but possible if
+        full_eval_every is very large and the loop exits early), fall back to
+        best-by-minibatch.
+        """
+        if not self.pareto_score_table:
+            return self._best_by_minibatch()
+
+        totals = {
+            prompt_configuration_id: self.aggregate_instances(vector)
+            for prompt_configuration_id, vector in self.pareto_score_table.items()
+        }
+
+        best_ids: List[PromptConfigurationId] = []
+        best_val = float("-inf")
+
+        for cand_id, aggregate in totals.items():
+            if aggregate > best_val + 1e-12:
+                best_val = aggregate
+                best_ids = [cand_id]
+            elif abs(aggregate - best_val) <= 1e-12:
+                best_ids.append(cand_id)
+
+        chosen_id = self.random_state.choice(best_ids)
+        return self.prompt_configurations_by_id[chosen_id]
+
+    def _select_candidate(self) -> PromptConfiguration:
+        """
+        Epsilon-greedy candidate selection:
+
+        - With probability ``exploration_probability``, pick a random candidate.
+        - Otherwise, pick the candidate with the highest mean minibatch score.
+        """
+        if not self.prompt_configurations_by_id:
+            raise DeepEvalError(
+                "COPRORunner has no prompt configurations to select from."
+            )
+
+        candidate_ids = list(self.prompt_configurations_by_id.keys())
+        if not candidate_ids:
+            raise DeepEvalError(
+                "COPRORunner has an empty candidate pool; this should not happen."
+            )
+
+        eps = float(self.exploration_probability)
+        if eps > 0.0 and self.random_state.random() < eps:
+            chosen_id = self.random_state.choice(candidate_ids)
+        else:
+            chosen_id = self._best_by_minibatch().id
+
+        return self.prompt_configurations_by_id[chosen_id]
+
+    def _draw_minibatch(
+        self,
+        goldens: Union[List["Golden"], List["ConversationalGolden"]],
+    ) -> Union[List["Golden"], List["ConversationalGolden"]]:
+        """
+        Determine effective minibatch size, bounded by the available goldens,
+        and sample with replacement.
+        """
+        n = len(goldens)
+        if n <= 0:
+            return []
+
+        size = min(self.minibatch_size, n)
+
+        return [goldens[self.random_state.randrange(0, n)] for _ in range(size)]
+
+    async def _a_full_evaluate_best(
+        self,
+        goldens: Union[List["Golden"], List["ConversationalGolden"]],
+    ) -> None:
+        if not self.prompt_configurations_by_id:
+            return
+
+        best = self._best_by_minibatch()
+        if best.id in self.pareto_score_table:
+            return
+
+        scores = await self.scorer.a_score_pareto(best, goldens)
+        self.pareto_score_table[best.id] = scores
+
+    def _full_evaluate_best(
+        self,
+        goldens: Union[List["Golden"], List["ConversationalGolden"]],
+    ) -> None:
+        if not self.prompt_configurations_by_id:
+            return
+
+        best = self._best_by_minibatch()
+        if best.id in self.pareto_score_table:
+            return
+
+        scores = self.scorer.score_pareto(best, goldens)
+        self.pareto_score_table[best.id] = scores
+
+    async def _a_generate_child_prompt(
+        self,
+        selected_module_id: ModuleId,
+        parent_prompt_configuration: PromptConfiguration,
+        feedback_text: str,
+    ) -> Optional[Prompt]:
+        try:
+            old_prompt = parent_prompt_configuration.prompts[selected_module_id]
+        except KeyError as exc:
+            raise DeepEvalError(
+                "COPRORunner expected a prompt for module_id "
+                f"{selected_module_id!r} but none was found in the "
+                "current prompt configuration."
+            ) from exc
+
+        new_prompt = await self._rewriter.a_rewrite(
+            module_id=selected_module_id,
+            old_prompt=old_prompt,
+            feedback_text=feedback_text,
+        )
+
+        if old_prompt.type != new_prompt.type or self._prompts_equivalent(
+            old_prompt, new_prompt
+        ):
+            # Don't accept if new prompt is the same as parent, or if type changed.
+            return None
+        return new_prompt
+
+    def _generate_child_prompt(
+        self,
+        selected_module_id: ModuleId,
+        parent_prompt_configuration: PromptConfiguration,
+        feedback_text: str,
+    ) -> Optional[Prompt]:
+        try:
+            old_prompt = parent_prompt_configuration.prompts[selected_module_id]
+        except KeyError as exc:
+            # This should never happen in normal operation.
+            raise DeepEvalError(
+                "COPRORunner expected a prompt for module_id "
+                f"{selected_module_id!r} but none was found in the "
+                "current prompt configuration."
+            ) from exc
+
+        new_prompt = self._rewriter.rewrite(
+            module_id=selected_module_id,
+            old_prompt=old_prompt,
+            feedback_text=feedback_text,
+        )
+
+        if old_prompt.type != new_prompt.type or self._prompts_equivalent(
+            old_prompt, new_prompt
+        ):
+            # Don't accept if new prompt is the same as parent, or if type changed.
+            return None
+        return new_prompt
+
+    def _make_child(
+        self,
+        selected_module_id: ModuleId,
+        parent_prompt_configuration: PromptConfiguration,
+        child_prompt: Prompt,
+    ) -> PromptConfiguration:
+        child_prompt_configuration = PromptConfiguration.new(
+            prompts=dict(parent_prompt_configuration.prompts),
+            parent=parent_prompt_configuration.id,
+        )
+        child_prompt_configuration.prompts[selected_module_id] = child_prompt
+        return child_prompt_configuration
+
+    def _update_progress(
+        self,
+        total_iterations: int,
+        iteration: int,
+        remaining_iterations: int,
+        elapsed: float,
+    ) -> None:
+        if self.status_callback is not None:
+            detail = (
+                f"(iterations={total_iterations}) "
+                f"• iteration {iteration}/{total_iterations} "
+                f"• {elapsed:.2f}s • remaining={remaining_iterations}"
+            )
+            self.status_callback(
+                RunnerStatusType.PROGRESS,
+                step_index=iteration,
+                total_steps=total_iterations,
+                detail=detail,
+            )
+
+    def _update_error(
+        self,
+        total_iterations: int,
+        iteration: int,
+        exc: Exception,
+    ) -> None:
+        # Report a user-facing error event.
+        if self.status_callback is not None:
+            detail = (
+                f"(iterations={total_iterations}) "
+                f"• error {exc.__class__.__name__}: {exc} "
+                f"• halted at iteration {iteration}"
+            )
+            self.status_callback(
+                RunnerStatusType.ERROR,
+                step_index=iteration,
+                total_steps=total_iterations,
+                detail=detail,
+            )
+
+    def _run_loop_iteration(
+        self,
+        copro_iteration: Callable[[], bool],
+    ) -> None:
+        total_iterations = self.iterations
+        remaining_iterations = total_iterations
+        iteration = 0
+        self._update_progress(
+            total_iterations, iteration, remaining_iterations, 0.0
+        )
+        while remaining_iterations > 0:
+            iteration += 1
+            start_time = time.perf_counter()
+            try:
+                ok = copro_iteration()
+            except Exception as exc:
+                self._update_error(total_iterations, iteration, exc)
+                break
+            elapsed = time.perf_counter() - start_time
+            if not ok:
+                break
+            remaining_iterations -= 1
+            self._update_progress(
+                total_iterations, iteration, remaining_iterations, elapsed
+            )
+
+    async def _a_run_loop_iteration(
+        self,
+        a_copro_iteration: Callable[[], Awaitable[bool]],
+    ) -> None:
+        total_iterations = self.iterations
+        remaining_iterations = total_iterations
+        iteration = 0
+        self._update_progress(
+            total_iterations, iteration, remaining_iterations, 0.0
+        )
+        while remaining_iterations > 0:
+            iteration += 1
+            start_time = time.perf_counter()
+            try:
+                ok = await a_copro_iteration()
+            except Exception as exc:
+                self._update_error(total_iterations, iteration, exc)
+                break
+            elapsed = time.perf_counter() - start_time
+            if not ok:
+                break
+            remaining_iterations -= 1
+            self._update_progress(
+                total_iterations, iteration, remaining_iterations, elapsed
+            )
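
For orientation, the sketch below shows how the new COPRO runner's public surface (its constructor parameters and execute() signature) fits together, using only names visible in this hunk. The stub scorer is a hypothetical, duck-typed stand-in: in the released package a real Scorer and Rewriter are normally constructed and attached by PromptOptimizer (see deepeval/optimizer/prompt_optimizer.py in the file list above), and the actual BaseScorer/Rewriter interfaces may require more than is shown here. The Prompt and Golden constructor arguments are likewise illustrative, not taken from this diff.

# Illustrative sketch only; not part of the package. The stub scorer and the
# Prompt/Golden arguments are assumptions, and a rewriter must still be wired
# onto the runner (normally done by PromptOptimizer) before execute() is called.
from deepeval.dataset.golden import Golden
from deepeval.prompt.prompt import Prompt
from deepeval.optimizer.algorithms.copro.copro import COPRO


class StubScorer:
    # Hypothetical duck-typed stand-in for deepeval.optimizer.scorer.base.BaseScorer,
    # covering only the synchronous calls COPRO.execute() makes.
    def score_minibatch(self, prompt_configuration, minibatch) -> float:
        return 0.5  # mean quality of the candidate on the sampled goldens

    def get_minibatch_feedback(self, prompt_configuration, module_id, minibatch) -> str:
        return "Responses drifted from the retrieved context."

    def score_pareto(self, prompt_configuration, goldens):
        return [0.5 for _ in goldens]  # one score per golden (full evaluation)


runner = COPRO(
    iterations=5,                 # total optimization trials
    minibatch_size=8,             # goldens sampled (with replacement) per trial
    exploration_probability=0.2,  # epsilon-greedy exploration rate
    full_eval_every=5,            # full re-score of the current best every N trials
    population_size=4,            # max candidates kept in the pool
    proposals_per_step=4,         # cooperative children proposed per trial
    scorer=StubScorer(),          # required; _ensure_scorer() raises without it
)

prompt = Prompt(alias="summarizer")                      # hypothetical prompt to optimize
goldens = [Golden(input="Summarize the release notes.")]

# runner._rewriter must be set (PromptOptimizer normally does this) before running:
# best_prompt, report = runner.execute(prompt, goldens)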