deepeval-3.7.3-py3-none-any.whl → deepeval-3.7.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (156)
  1. deepeval/_version.py +1 -1
  2. deepeval/cli/test.py +1 -1
  3. deepeval/config/settings.py +102 -13
  4. deepeval/dataset/golden.py +54 -2
  5. deepeval/evaluate/configs.py +1 -1
  6. deepeval/evaluate/evaluate.py +16 -8
  7. deepeval/evaluate/execute.py +74 -27
  8. deepeval/evaluate/utils.py +26 -22
  9. deepeval/integrations/pydantic_ai/agent.py +19 -2
  10. deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
  11. deepeval/metrics/__init__.py +14 -12
  12. deepeval/metrics/answer_relevancy/answer_relevancy.py +74 -29
  13. deepeval/metrics/answer_relevancy/template.py +188 -92
  14. deepeval/metrics/argument_correctness/template.py +2 -2
  15. deepeval/metrics/base_metric.py +2 -5
  16. deepeval/metrics/bias/template.py +3 -3
  17. deepeval/metrics/contextual_precision/contextual_precision.py +53 -15
  18. deepeval/metrics/contextual_precision/template.py +115 -66
  19. deepeval/metrics/contextual_recall/contextual_recall.py +50 -13
  20. deepeval/metrics/contextual_recall/template.py +106 -55
  21. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +47 -15
  22. deepeval/metrics/contextual_relevancy/template.py +87 -58
  23. deepeval/metrics/conversation_completeness/template.py +2 -2
  24. deepeval/metrics/conversational_dag/templates.py +4 -4
  25. deepeval/metrics/conversational_g_eval/template.py +4 -3
  26. deepeval/metrics/dag/templates.py +5 -5
  27. deepeval/metrics/faithfulness/faithfulness.py +70 -27
  28. deepeval/metrics/faithfulness/schema.py +1 -1
  29. deepeval/metrics/faithfulness/template.py +200 -115
  30. deepeval/metrics/g_eval/utils.py +2 -2
  31. deepeval/metrics/hallucination/template.py +4 -4
  32. deepeval/metrics/indicator.py +4 -4
  33. deepeval/metrics/misuse/template.py +2 -2
  34. deepeval/metrics/multimodal_metrics/__init__.py +0 -18
  35. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +24 -17
  36. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +26 -21
  37. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +24 -17
  38. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +24 -17
  39. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +19 -19
  40. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +63 -78
  41. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +20 -20
  42. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +71 -50
  43. deepeval/metrics/non_advice/template.py +2 -2
  44. deepeval/metrics/pii_leakage/template.py +2 -2
  45. deepeval/metrics/prompt_alignment/template.py +4 -4
  46. deepeval/metrics/ragas.py +3 -3
  47. deepeval/metrics/role_violation/template.py +2 -2
  48. deepeval/metrics/step_efficiency/step_efficiency.py +1 -1
  49. deepeval/metrics/tool_correctness/tool_correctness.py +2 -2
  50. deepeval/metrics/toxicity/template.py +4 -4
  51. deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  52. deepeval/metrics/turn_contextual_precision/template.py +187 -0
  53. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +550 -0
  54. deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  55. deepeval/metrics/turn_contextual_recall/template.py +178 -0
  56. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +520 -0
  57. deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
  58. deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
  59. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +535 -0
  60. deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
  61. deepeval/metrics/turn_faithfulness/template.py +218 -0
  62. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +596 -0
  63. deepeval/metrics/turn_relevancy/template.py +2 -2
  64. deepeval/metrics/utils.py +39 -58
  65. deepeval/models/__init__.py +0 -12
  66. deepeval/models/base_model.py +16 -38
  67. deepeval/models/embedding_models/__init__.py +7 -0
  68. deepeval/models/embedding_models/azure_embedding_model.py +69 -32
  69. deepeval/models/embedding_models/local_embedding_model.py +39 -22
  70. deepeval/models/embedding_models/ollama_embedding_model.py +42 -18
  71. deepeval/models/embedding_models/openai_embedding_model.py +50 -15
  72. deepeval/models/llms/amazon_bedrock_model.py +1 -2
  73. deepeval/models/llms/anthropic_model.py +53 -20
  74. deepeval/models/llms/azure_model.py +140 -43
  75. deepeval/models/llms/deepseek_model.py +38 -23
  76. deepeval/models/llms/gemini_model.py +222 -103
  77. deepeval/models/llms/grok_model.py +39 -27
  78. deepeval/models/llms/kimi_model.py +39 -23
  79. deepeval/models/llms/litellm_model.py +103 -45
  80. deepeval/models/llms/local_model.py +35 -22
  81. deepeval/models/llms/ollama_model.py +129 -17
  82. deepeval/models/llms/openai_model.py +151 -50
  83. deepeval/models/llms/portkey_model.py +149 -0
  84. deepeval/models/llms/utils.py +5 -3
  85. deepeval/models/retry_policy.py +17 -14
  86. deepeval/models/utils.py +94 -4
  87. deepeval/optimizer/__init__.py +5 -0
  88. deepeval/optimizer/algorithms/__init__.py +6 -0
  89. deepeval/optimizer/algorithms/base.py +29 -0
  90. deepeval/optimizer/algorithms/configs.py +18 -0
  91. deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  92. deepeval/optimizer/algorithms/copro/copro.py +836 -0
  93. deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  94. deepeval/optimizer/algorithms/gepa/gepa.py +737 -0
  95. deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  96. deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  97. deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  98. deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  99. deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  100. deepeval/optimizer/algorithms/simba/simba.py +999 -0
  101. deepeval/optimizer/algorithms/simba/types.py +15 -0
  102. deepeval/optimizer/configs.py +31 -0
  103. deepeval/optimizer/policies.py +227 -0
  104. deepeval/optimizer/prompt_optimizer.py +263 -0
  105. deepeval/optimizer/rewriter/__init__.py +5 -0
  106. deepeval/optimizer/rewriter/rewriter.py +124 -0
  107. deepeval/optimizer/rewriter/utils.py +214 -0
  108. deepeval/optimizer/scorer/__init__.py +5 -0
  109. deepeval/optimizer/scorer/base.py +86 -0
  110. deepeval/optimizer/scorer/scorer.py +316 -0
  111. deepeval/optimizer/scorer/utils.py +30 -0
  112. deepeval/optimizer/types.py +148 -0
  113. deepeval/optimizer/utils.py +480 -0
  114. deepeval/prompt/prompt.py +7 -6
  115. deepeval/test_case/__init__.py +1 -3
  116. deepeval/test_case/api.py +12 -10
  117. deepeval/test_case/conversational_test_case.py +19 -1
  118. deepeval/test_case/llm_test_case.py +152 -1
  119. deepeval/test_case/utils.py +4 -8
  120. deepeval/test_run/api.py +15 -14
  121. deepeval/test_run/cache.py +2 -0
  122. deepeval/test_run/test_run.py +9 -4
  123. deepeval/tracing/patchers.py +9 -4
  124. deepeval/tracing/tracing.py +2 -2
  125. deepeval/utils.py +89 -0
  126. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/METADATA +1 -4
  127. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/RECORD +134 -118
  128. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
  129. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
  130. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
  131. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
  132. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
  133. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
  134. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
  135. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
  136. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
  137. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
  138. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
  139. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  140. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
  141. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
  142. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  143. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
  144. deepeval/models/mlllms/__init__.py +0 -4
  145. deepeval/models/mlllms/azure_model.py +0 -334
  146. deepeval/models/mlllms/gemini_model.py +0 -284
  147. deepeval/models/mlllms/ollama_model.py +0 -144
  148. deepeval/models/mlllms/openai_model.py +0 -258
  149. deepeval/test_case/mllm_test_case.py +0 -170
  150. /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
  151. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
  152. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
  153. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
  154. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/LICENSE.md +0 -0
  155. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/WHEEL +0 -0
  156. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/entry_points.txt +0 -0
deepeval/optimizer/algorithms/simba/simba.py ADDED
@@ -0,0 +1,999 @@
+# - SIMBA-style multi-strategy 0-shot variant:
+#   - Works on a single set of goldens (no D_pareto split).
+#   - Maintains a bounded population of candidate prompts
+#     (size controlled by `population_size`).
+#   - At each iteration:
+#     - Select a parent via epsilon-greedy on mean minibatch score.
+#     - Sample a minibatch of goldens for scoring.
+#     - Compute feedback once for the parent + minibatch.
+#     - Propose multiple child prompts cooperatively from the same parent
+#       (up to `proposals_per_step` children), each using a SIMBA edit
+#       strategy (e.g., APPEND_DEMO or APPEND_RULE).
+#     - For each child, accept it if its minibatch score improves on the
+#       parent by at least `min_delta`, add it to the pool, and prune
+#       low-scoring candidates if the population exceeds `population_size`.
+#   - Uses `full_eval_every` (if set) to periodically re-score the current
+#     best candidate on the full golden set.
+
+from __future__ import annotations
+
+import random
+import time
+import uuid
+from typing import (
+    Awaitable,
+    Callable,
+    Dict,
+    List,
+    Optional,
+    Tuple,
+    Union,
+)
+
+from deepeval.models.base_model import DeepEvalBaseLLM
+
+from deepeval.errors import DeepEvalError
+from deepeval.dataset.golden import ConversationalGolden, Golden
+from deepeval.optimizer.utils import Aggregator, mean_of_all
+from deepeval.optimizer.types import (
+    AcceptedIterationDict,
+    ModuleId,
+    OptimizationReport,
+    PromptConfiguration,
+    PromptConfigurationId,
+    RunnerStatusCallback,
+    RunnerStatusType,
+    ScoreTable,
+)
+from deepeval.optimizer.scorer.base import BaseScorer
+from deepeval.optimizer.algorithms.base import BaseAlgorithm
+from deepeval.optimizer.utils import build_prompt_config_snapshots
+from deepeval.prompt.api import PromptType
+from deepeval.prompt.prompt import Prompt
+from deepeval.optimizer.rewriter import Rewriter
+
+from deepeval.optimizer.algorithms.configs import (
+    MIPROV2_MIN_DELTA,
+    MIPROV2_REWRITE_INSTRUCTION_MAX_CHARS,
+    SIMBA_DEMO_INPUT_MAX_CHARS,
+)
+from deepeval.optimizer.algorithms.simba.types import SIMBAStrategy
+
+
+class SIMBA(BaseAlgorithm):
+    """
+    SIMBA-style cooperative prompt optimization loop with sync/async execution.
+
+    This runner is intentionally low level and does not know about metrics,
+    models, or async configs. It relies on a preconfigured Scorer and
+    Rewriter, which are typically constructed by PromptOptimizer.
+
+    Parameters
+    ----------
+    iterations : int
+        Total number of optimization trials. Default is 5.
+    minibatch_size : int
+        Number of examples drawn per iteration. Default is 8.
+    random_seed : int, optional
+        RNG seed for reproducibility. If None, derived from time.time_ns().
+    exploration_probability : float
+        Epsilon greedy exploration rate. Default is 0.2.
+    full_eval_every : int, optional
+        Fully evaluate best candidate every N trials. Default is 5.
+    population_size : int
+        Maximum number of candidates in the pool. Default is 4.
+    proposals_per_step : int
+        Number of child prompts proposed per iteration. Default is 4.
+    max_demos_per_proposal : int
+        Maximum demos from minibatch for APPEND_DEMO strategy. Default is 3.
+    """
+
+    name = "SIMBA"
+    SINGLE_MODULE_ID: ModuleId = "__module__"
+
+    def __init__(
+        self,
+        iterations: int = 5,
+        minibatch_size: int = 8,
+        random_seed: Optional[int] = None,
+        exploration_probability: float = 0.2,
+        full_eval_every: Optional[int] = 5,
+        population_size: int = 4,
+        proposals_per_step: int = 4,
+        max_demos_per_proposal: int = 3,
+        aggregate_instances: Aggregator = mean_of_all,
+        scorer: Optional[BaseScorer] = None,
+    ) -> None:
+        # Validate parameters
+        if iterations < 1:
+            raise ValueError("iterations must be >= 1")
+        if minibatch_size < 1:
+            raise ValueError("minibatch_size must be >= 1")
+        if exploration_probability < 0.0 or exploration_probability > 1.0:
+            raise ValueError(
+                "exploration_probability must be >= 0.0 and <= 1.0"
+            )
+        if full_eval_every is not None and full_eval_every < 1:
+            raise ValueError("full_eval_every must be >= 1")
+        if population_size < 1:
+            raise ValueError("population_size must be >= 1")
+        if proposals_per_step < 1:
+            raise ValueError("proposals_per_step must be >= 1")
+        if max_demos_per_proposal < 0:
+            raise ValueError("max_demos_per_proposal must be >= 0")
+
+        self.iterations = iterations
+        self.minibatch_size = minibatch_size
+        self.exploration_probability = exploration_probability
+        self.full_eval_every = full_eval_every
+        self.population_size = population_size
+        self.proposals_per_step = proposals_per_step
+        self.max_demos_per_proposal = max_demos_per_proposal
+        self.aggregate_instances = aggregate_instances
+        self.scorer = scorer
+
+        if max_demos_per_proposal > 0:
+            self._strategies = [
+                SIMBAStrategy.APPEND_DEMO,
+                SIMBAStrategy.APPEND_RULE,
+            ]
+        else:
+            self._strategies = [SIMBAStrategy.APPEND_RULE]
+
+        # If no seed provided, use time-based seed
+        if random_seed is None:
+            random_seed = time.time_ns()
+        self.random_seed = random_seed
+        self.random_state = random.Random(random_seed)
+
+        # Runtime state to be reset between runs
+        self.reset_state()
+
+        # Status callback set by PromptOptimizer:
+        # (kind, step_index, total_steps, detail) -> None
+        self.status_callback: Optional[RunnerStatusCallback] = None
+
+        # Optimizer model used by the rewriter for prompt mutation.
+        # Set by PromptOptimizer.
+        self.optimizer_model: Optional["DeepEvalBaseLLM"] = None
+
+        # Lazy-loaded Rewriter set by PromptOptimizer
+        self._rewriter: Optional[Rewriter] = None
+
+    ##############
+    # Public API #
+    ##############
+
+    def execute(
+        self,
+        prompt: Prompt,
+        goldens: Union[List[Golden], List[ConversationalGolden]],
+    ) -> Tuple[Prompt, OptimizationReport]:
+        """
+        Synchronous SIMBA run from a full list of goldens.
+
+        The full goldens set is used both for mini-batched scoring during
+        optimization and for a final full evaluation of the best candidate.
+        """
+        total_goldens = len(goldens)
+        if total_goldens < 1:
+            raise DeepEvalError(
+                "SIMBA prompt optimization requires at least 1 golden, but "
+                f"received {total_goldens}. Provide at least one golden to run "
+                "the optimizer."
+            )
+
+        self._ensure_scorer()
+        self.reset_state()
+
+        # Seed candidate pool with the root prompt configuration.
+        seed_prompts_by_module = {self.SINGLE_MODULE_ID: prompt}
+        root_prompt_configuration = PromptConfiguration.new(
+            prompts=dict(seed_prompts_by_module)
+        )
+        # Add root candidate to the pool, but defer its first minibatch
+        # evaluation until the first iteration so that any long running
+        # model calls happen under the main loop (with progress updates).
+        self._add_prompt_configuration(root_prompt_configuration)
+
+        accepted_iterations: List[Dict] = []
+        self.trial_index = 0
+
+        def _one_iteration() -> bool:
+            nonlocal accepted_iterations
+
+            if not goldens:
+                return False
+
+            # Lazily seed with a minibatch score for the root
+            # candidate on the first iteration.
+            if not self._minibatch_score_counts:
+                seed_minibatch = self._draw_minibatch(goldens)
+                root_score = self.scorer.score_minibatch(
+                    root_prompt_configuration, seed_minibatch
+                )
+                self._record_minibatch_score(
+                    root_prompt_configuration.id, root_score
+                )
+
+            # 1. Choose which candidate prompt to mutate.
+            parent_prompt_configuration = self._select_candidate()
+            selected_module_id: ModuleId = self.SINGLE_MODULE_ID
+
+            minibatch = self._draw_minibatch(goldens)
+
+            # Compute shared feedback for this parent/minibatch that will be
+            # used by all SIMBA proposals in this iteration.
+            feedback_text = self.scorer.get_minibatch_feedback(
+                parent_prompt_configuration, selected_module_id, minibatch
+            )
+
+            before_mean = self._mean_minibatch_score(
+                parent_prompt_configuration.id
+            )
+            jitter = 1e-6
+            min_delta = max(MIPROV2_MIN_DELTA, jitter)
+
+            # 2. Generate multiple SIMBA child prompts and evaluate them.
+            num_proposals = int(self.proposals_per_step)
+            for _ in range(num_proposals):
+                strategy = self._sample_strategy()
+                child_prompt = self._generate_child_prompt(
+                    strategy,
+                    selected_module_id,
+                    parent_prompt_configuration,
+                    feedback_text,
+                    minibatch,
+                )
+                if child_prompt is None:
+                    # No child, nothing to evaluate for this proposal.
+                    continue
+
+                child_prompt_configuration = self._make_child(
+                    selected_module_id,
+                    parent_prompt_configuration,
+                    child_prompt,
+                )
+
+                child_score = self.scorer.score_minibatch(
+                    child_prompt_configuration, minibatch
+                )
+
+                # 3. Evaluate & decide whether to accept the child.
+                if child_score >= before_mean + min_delta:
+                    # Accept: add to pool, update surrogate stats, and record iteration.
+                    self._add_prompt_configuration(child_prompt_configuration)
+                    self._record_minibatch_score(
+                        child_prompt_configuration.id, child_score
+                    )
+
+                    accepted_iterations.append(
+                        AcceptedIterationDict(
+                            parent=parent_prompt_configuration.id,
+                            child=child_prompt_configuration.id,
+                            module=selected_module_id,
+                            before=before_mean,
+                            after=child_score,
+                        )
+                    )
+                # else: reject; do not add child to the candidate pool.
+
+            self.trial_index += 1
+            if (
+                self.full_eval_every is not None
+                and self.trial_index % self.full_eval_every == 0
+            ):
+                self._full_evaluate_best(goldens)
+
+            return True
+
+        self._run_loop_iteration(_one_iteration)
+
+        # Ensure at least one candidate has been fully evaluated.
+        if not self.pareto_score_table:
+            self._full_evaluate_best(goldens)
+
+        best = self._best_by_aggregate()
+        prompt_config_snapshots = build_prompt_config_snapshots(
+            self.prompt_configurations_by_id
+        )
+        report = OptimizationReport(
+            optimization_id=self.optimization_id,
+            best_id=best.id,
+            accepted_iterations=accepted_iterations,
+            pareto_scores=self.pareto_score_table,
+            parents=self.parents_by_id,
+            prompt_configurations=prompt_config_snapshots,
+        )
+        return best.prompts[self.SINGLE_MODULE_ID], report
+
+    async def a_execute(
+        self,
+        prompt: Prompt,
+        goldens: Union[List[Golden], List[ConversationalGolden]],
+    ) -> Tuple[Prompt, OptimizationReport]:
+        """
+        Asynchronous twin of execute().
+        """
+        total_goldens = len(goldens)
+        if total_goldens < 1:
+            raise DeepEvalError(
+                "SIMBA prompt optimization requires at least 1 golden, but "
+                f"received {total_goldens}. Provide at least one golden to run "
+                "the optimizer."
+            )
+
+        self._ensure_scorer()
+        self.reset_state()
+
+        seed_prompts_by_module = {self.SINGLE_MODULE_ID: prompt}
+        root_prompt_configuration = PromptConfiguration.new(
+            prompts=dict(seed_prompts_by_module)
+        )
+        self._add_prompt_configuration(root_prompt_configuration)
+
+        accepted_iterations: List[Dict] = []
+        self.trial_index = 0
+
+        async def _one_iteration() -> bool:
+            nonlocal accepted_iterations
+
+            if not goldens:
+                return False
+
+            if not self._minibatch_score_counts:
+                seed_minibatch = self._draw_minibatch(goldens)
+                root_score = await self.scorer.a_score_minibatch(
+                    root_prompt_configuration, seed_minibatch
+                )
+                self._record_minibatch_score(
+                    root_prompt_configuration.id, root_score
+                )
+
+            parent_prompt_configuration = self._select_candidate()
+            selected_module_id: ModuleId = self.SINGLE_MODULE_ID
+
+            minibatch = self._draw_minibatch(goldens)
+
+            feedback_text = await self.scorer.a_get_minibatch_feedback(
+                parent_prompt_configuration, selected_module_id, minibatch
+            )
+
+            before_mean = self._mean_minibatch_score(
+                parent_prompt_configuration.id
+            )
+            jitter = 1e-6
+            min_delta = max(MIPROV2_MIN_DELTA, jitter)
+
+            num_proposals = int(self.proposals_per_step)
+            for _ in range(num_proposals):
+                strategy = self._sample_strategy()
+                child_prompt = await self._a_generate_child_prompt(
+                    strategy,
+                    selected_module_id,
+                    parent_prompt_configuration,
+                    feedback_text,
+                    minibatch,
+                )
+                if child_prompt is None:
+                    continue
+
+                child_prompt_configuration = self._make_child(
+                    selected_module_id,
+                    parent_prompt_configuration,
+                    child_prompt,
+                )
+
+                child_score = await self.scorer.a_score_minibatch(
+                    child_prompt_configuration, minibatch
+                )
+
+                if child_score >= before_mean + min_delta:
+                    self._add_prompt_configuration(child_prompt_configuration)
+                    self._record_minibatch_score(
+                        child_prompt_configuration.id, child_score
+                    )
+
+                    accepted_iterations.append(
+                        AcceptedIterationDict(
+                            parent=parent_prompt_configuration.id,
+                            child=child_prompt_configuration.id,
+                            module=selected_module_id,
+                            before=before_mean,
+                            after=child_score,
+                        )
+                    )
+
+            self.trial_index += 1
+            if (
+                self.full_eval_every is not None
+                and self.trial_index % self.full_eval_every == 0
+            ):
+                await self._a_full_evaluate_best(goldens)
+
+            return True
+
+        await self._a_run_loop_iteration(_one_iteration)
+
+        if not self.pareto_score_table:
+            await self._a_full_evaluate_best(goldens)
+
+        best = self._best_by_aggregate()
+        prompt_config_snapshots = build_prompt_config_snapshots(
+            self.prompt_configurations_by_id
+        )
+        report = OptimizationReport(
+            optimization_id=self.optimization_id,
+            best_id=best.id,
+            accepted_iterations=accepted_iterations,
+            pareto_scores=self.pareto_score_table,
+            parents=self.parents_by_id,
+            prompt_configurations=prompt_config_snapshots,
+        )
+        return best.prompts[self.SINGLE_MODULE_ID], report
+
+    ###################
+    # State & helpers #
+    ###################
+
+    def reset_state(self) -> None:
+        self.optimization_id = str(uuid.uuid4())
+        self.prompt_configurations_by_id: Dict[
+            PromptConfigurationId, PromptConfiguration
+        ] = {}
+        self.parents_by_id: Dict[
+            PromptConfigurationId, Optional[PromptConfigurationId]
+        ] = {}
+        # For SIMBA we reuse the same field name as GEPA for full-eval scores.
+        self.pareto_score_table: ScoreTable = {}
+
+        # Surrogate stats: running mean minibatch scores per candidate.
+        self._minibatch_score_sums: Dict[PromptConfigurationId, float] = {}
+        self._minibatch_score_counts: Dict[PromptConfigurationId, int] = {}
+
+        # Trial counter (used for full_eval_every).
+        self.trial_index: int = 0
+
+    def _ensure_scorer(self) -> None:
+        if self.scorer is None:
+            raise DeepEvalError(
+                "SIMBARunner requires a `scorer`. "
+                "Construct one (for example, Scorer) in "
+                "PromptOptimizer and assign it to `runner.scorer`."
+            )
+
+    def _prompts_equivalent(
+        self,
+        old_prompt: Prompt,
+        new_prompt: Prompt,
+    ) -> bool:
+        """
+        Compare two Prompts for optimization purposes.
+
+        We treat a child as "no change" if:
+        - The types differ, or
+        - For TEXT: trimmed text_template matches.
+        - For LIST: messages_template length, roles, and trimmed content match.
+        """
+
+        if new_prompt.type == PromptType.LIST:
+            old_msgs = old_prompt.messages_template
+            new_msgs = new_prompt.messages_template
+            if len(old_msgs) != len(new_msgs):
+                return False
+
+            for old_msg, new_msg in zip(old_msgs, new_msgs):
+                if old_msg.role != new_msg.role:
+                    return False
+                if (old_msg.content or "").strip() != (
+                    new_msg.content or ""
+                ).strip():
+                    return False
+
+            return True
+
+        old_txt = (old_prompt.text_template or "").strip()
+        new_txt = (new_prompt.text_template or "").strip()
+        return new_txt == old_txt
+
+    def _add_prompt_configuration(
+        self,
+        prompt_configuration: PromptConfiguration,
+    ) -> None:
+        """
+        Add a candidate to the active pool and, if a population limit is set,
+        prune the worst-scoring candidates to enforce it.
+        """
+        self.prompt_configurations_by_id[prompt_configuration.id] = (
+            prompt_configuration
+        )
+        self.parents_by_id[prompt_configuration.id] = (
+            prompt_configuration.parent
+        )
+
+        # If we exceed the population size, iteratively prune the worst
+        # (by mean minibatch score), never removing the current best.
+        while len(self.prompt_configurations_by_id) > self.population_size:
+            best_id: Optional[PromptConfigurationId] = None
+            best_score = float("-inf")
+            for cand_id in self.prompt_configurations_by_id.keys():
+                mean_score = self._mean_minibatch_score(cand_id)
+                if mean_score > best_score:
+                    best_score = mean_score
+                    best_id = cand_id
+
+            worst_id: Optional[PromptConfigurationId] = None
+            worst_score = float("inf")
+            for cand_id in self.prompt_configurations_by_id.keys():
+                if cand_id == best_id:
+                    continue
+                mean_score = self._mean_minibatch_score(cand_id)
+                if mean_score < worst_score:
+                    worst_score = mean_score
+                    worst_id = cand_id
+
+            if worst_id is None or worst_id == best_id:
+                break
+
+            # Prune the chosen worst candidate from all bookkeeping tables.
+            self.prompt_configurations_by_id.pop(worst_id, None)
+            self.parents_by_id.pop(worst_id, None)
+            self._minibatch_score_sums.pop(worst_id, None)
+            self._minibatch_score_counts.pop(worst_id, None)
+            self.pareto_score_table.pop(worst_id, None)
+
+    def _record_minibatch_score(
+        self,
+        prompt_configuration_id: PromptConfigurationId,
+        score: float,
+    ) -> None:
+        self._minibatch_score_sums[prompt_configuration_id] = (
+            self._minibatch_score_sums.get(prompt_configuration_id, 0.0)
+            + float(score)
+        )
+        self._minibatch_score_counts[prompt_configuration_id] = (
+            self._minibatch_score_counts.get(prompt_configuration_id, 0) + 1
+        )
+
+    def _mean_minibatch_score(
+        self,
+        prompt_configuration_id: PromptConfigurationId,
+    ) -> float:
+        total = self._minibatch_score_sums.get(prompt_configuration_id, 0.0)
+        count = self._minibatch_score_counts.get(prompt_configuration_id, 0)
+        if count <= 0:
+            # Use a sentinel that will not dominate selection if a scored
+            # candidate exists. Root is seeded explicitly in the first iteration.
+            return float("-inf")
+        return total / count
+
+    def _best_by_minibatch(self) -> PromptConfiguration:
+        """
+        Return the candidate with the highest mean minibatch score.
+        """
+        if not self.prompt_configurations_by_id:
+            raise DeepEvalError(
+                "SIMBARunner has no prompt configurations; this should not happen."
+            )
+
+        best_id: Optional[PromptConfigurationId] = None
+        best_score = float("-inf")
+
+        for cand_id in self.prompt_configurations_by_id.keys():
+            mean_score = self._mean_minibatch_score(cand_id)
+            if mean_score > best_score:
+                best_score = mean_score
+                best_id = cand_id
+
+        if best_id is None:
+            # Fallback to the first candidate if all means are -inf.
+            best_id = next(iter(self.prompt_configurations_by_id.keys()))
+
+        return self.prompt_configurations_by_id[best_id]
+
+    def _best_by_aggregate(self) -> PromptConfiguration:
+        """
+        Return the best candidate based on full-eval scores.
+
+        If no full evaluation scores are available (should be rare, but possible if
+        full_eval_every is very large and the loop exits early), fall back to
+        best-by-minibatch.
+        """
+        if not self.pareto_score_table:
+            return self._best_by_minibatch()
+
+        totals = {
+            prompt_configuration_id: self.aggregate_instances(vector)
+            for prompt_configuration_id, vector in self.pareto_score_table.items()
+        }
+
+        best_ids: List[PromptConfigurationId] = []
+        best_val = float("-inf")
+
+        for cand_id, aggregate in totals.items():
+            if aggregate > best_val + 1e-12:
+                best_val = aggregate
+                best_ids = [cand_id]
+            elif abs(aggregate - best_val) <= 1e-12:
+                best_ids.append(cand_id)
+
+        chosen_id = self.random_state.choice(best_ids)
+        return self.prompt_configurations_by_id[chosen_id]
+
+    def _select_candidate(self) -> PromptConfiguration:
+        """
+        Epsilon-greedy candidate selection:
+
+        - With probability ``exploration_probability``, pick a random candidate.
+        - Otherwise, pick the candidate with the highest mean minibatch score.
+        """
+        if not self.prompt_configurations_by_id:
+            raise DeepEvalError(
+                "SIMBARunner has no prompt configurations to select from."
+            )
+
+        candidate_ids = list(self.prompt_configurations_by_id.keys())
+        if not candidate_ids:
+            raise DeepEvalError(
+                "SIMBARunner has an empty candidate pool; this should not happen."
+            )
+
+        eps = float(self.exploration_probability)
+        if eps > 0.0 and self.random_state.random() < eps:
+            chosen_id = self.random_state.choice(candidate_ids)
+        else:
+            chosen_id = self._best_by_minibatch().id
+
+        return self.prompt_configurations_by_id[chosen_id]
+
+    def _draw_minibatch(
+        self,
+        goldens: Union[List[Golden], List[ConversationalGolden]],
+    ) -> Union[List[Golden], List[ConversationalGolden]]:
+        """
+        Determine effective minibatch size, bounded by the available goldens,
+        and sample with replacement.
+        """
+        n = len(goldens)
+        if n <= 0:
+            return []
+
+        size = min(self.minibatch_size, n)
+
+        return [goldens[self.random_state.randrange(0, n)] for _ in range(size)]
+
+    async def _a_full_evaluate_best(
+        self,
+        goldens: Union[List[Golden], List[ConversationalGolden]],
+    ) -> None:
+        if not self.prompt_configurations_by_id:
+            return
+
+        best = self._best_by_minibatch()
+        if best.id in self.pareto_score_table:
+            return
+
+        scores = await self.scorer.a_score_pareto(best, goldens)
+        self.pareto_score_table[best.id] = scores
+
+    def _full_evaluate_best(
+        self,
+        goldens: Union[List[Golden], List[ConversationalGolden]],
+    ) -> None:
+        if not self.prompt_configurations_by_id:
+            return
+
+        best = self._best_by_minibatch()
+        if best.id in self.pareto_score_table:
+            return
+
+        scores = self.scorer.score_pareto(best, goldens)
+        self.pareto_score_table[best.id] = scores
+
+    async def _a_generate_child_prompt(
+        self,
+        strategy: SIMBAStrategy,
+        selected_module_id: ModuleId,
+        parent_prompt_configuration: PromptConfiguration,
+        feedback_text: str,
+        minibatch: Union[List[Golden], List[ConversationalGolden]],
+    ) -> Optional[Prompt]:
+        try:
+            old_prompt = parent_prompt_configuration.prompts[selected_module_id]
+        except KeyError as exc:
+            raise DeepEvalError(
+                "SIMBARunner expected a prompt for module_id "
+                f"{selected_module_id!r} but none was found in the "
+                "current prompt configuration."
+            ) from exc
+
+        strategy_feedback = self._build_feedback_for_strategy(
+            strategy, feedback_text, minibatch
+        )
+
+        new_prompt = await self._rewriter.a_rewrite(
+            module_id=selected_module_id,
+            old_prompt=old_prompt,
+            feedback_text=strategy_feedback,
+        )
+
+        if old_prompt.type != new_prompt.type or self._prompts_equivalent(
+            old_prompt, new_prompt
+        ):
+            # Don't accept if new prompt is the same as parent, or if type changed.
+            return None
+        return new_prompt
+
+    def _generate_child_prompt(
+        self,
+        strategy: SIMBAStrategy,
+        selected_module_id: ModuleId,
+        parent_prompt_configuration: PromptConfiguration,
+        feedback_text: str,
+        minibatch: Union[List[Golden], List[ConversationalGolden]],
+    ) -> Optional[Prompt]:
+        try:
+            old_prompt = parent_prompt_configuration.prompts[selected_module_id]
+        except KeyError as exc:
+            # This should never happen in normal operation.
+            raise DeepEvalError(
+                "SIMBARunner expected a prompt for module_id "
+                f"{selected_module_id!r} but none was found in the "
+                "current prompt configuration."
+            ) from exc
+
+        strategy_feedback = self._build_feedback_for_strategy(
+            strategy, feedback_text, minibatch
+        )
+
+        new_prompt = self._rewriter.rewrite(
+            module_id=selected_module_id,
+            old_prompt=old_prompt,
+            feedback_text=strategy_feedback,
+        )
+
+        if old_prompt.type != new_prompt.type or self._prompts_equivalent(
+            old_prompt, new_prompt
+        ):
+            # Don't accept if new prompt is the same as parent, or if type changed.
+            return None
+        return new_prompt
+
+    def _make_child(
+        self,
+        selected_module_id: ModuleId,
+        parent_prompt_configuration: PromptConfiguration,
+        child_prompt: Prompt,
+    ) -> PromptConfiguration:
+        child_prompt_configuration = PromptConfiguration.new(
+            prompts=dict(parent_prompt_configuration.prompts),
+            parent=parent_prompt_configuration.id,
+        )
+        child_prompt_configuration.prompts[selected_module_id] = child_prompt
+        return child_prompt_configuration
+
+    def _truncate_instruction(self, text: str) -> str:
+        """
+        Truncate strategy instructions + feedback to the configured character
+        budget so the rewriter prompt does not explode.
+        """
+        max_chars = MIPROV2_REWRITE_INSTRUCTION_MAX_CHARS
+        if max_chars <= 0:
+            return text
+        if len(text) <= max_chars:
+            return text
+        return text[:max_chars]
+
+    def _build_demo_block(
+        self,
+        minibatch: Union[List[Golden], List[ConversationalGolden]],
+    ) -> str:
+        """
+        Build a small block of input/context/output demos from the current
+        minibatch, inspired by SIMBA's `append_a_demo` strategy.
+
+        For each Golden:
+
+        Golden:
+            Input <- golden.input
+            Context <- " ".join(golden.context) if present
+            Output <- golden.expected_output
+
+        ConversationalGolden:
+            Input <- golden.scenario
+            Context <- " ".join(golden.context) if present
+            Output <- golden.expected_outcome
+
+        All text segments are independently truncated to `SIMBA_DEMO_INPUT_MAX_CHARS`.
+        """
+        max_demos = self.max_demos_per_proposal
+        if max_demos <= 0:
+            return ""
+
+        lines: List[str] = []
+        demo_limit = min(max_demos, len(minibatch))
+        max_chars = SIMBA_DEMO_INPUT_MAX_CHARS
+
+        for golden in minibatch[:demo_limit]:
+            if isinstance(golden, Golden):
+                input_text = golden.input or ""
+                expected_output_text = golden.expected_output or ""
+                ctx_list = golden.context or []
+            elif isinstance(golden, ConversationalGolden):
+                input_text = golden.scenario or ""
+                expected_output_text = golden.expected_outcome or ""
+                ctx_list = golden.context or []
+            else:
+                # Unknown type; skip defensively
+                continue
+
+            context_text = " ".join(ctx_list) if ctx_list else ""
+
+            # Skip completely empty triples
+            if not input_text and not expected_output_text and not context_text:
+                continue
+
+            # Truncate each segment independently
+            if max_chars > 0:
+                if len(input_text) > max_chars:
+                    input_text = input_text[:max_chars]
+                if len(context_text) > max_chars:
+                    context_text = context_text[:max_chars]
+                if len(expected_output_text) > max_chars:
+                    expected_output_text = expected_output_text[:max_chars]
+
+            demo_lines: List[str] = [f"Input: {input_text}"]
+            if context_text:
+                demo_lines.append(f"Context: {context_text}")
+            demo_lines.append(f"Output: {expected_output_text}")
+
+            lines.append("\n".join(demo_lines))
+
+        return "\n\n".join(lines)
+
+    def _build_feedback_for_strategy(
+        self,
+        strategy: SIMBAStrategy,
+        feedback_text: str,
+        minibatch: Union[List[Golden], List[ConversationalGolden]],
+    ) -> str:
+        """
+        Construct a strategy-specific feedback string that is passed into
+        Rewriter.rewrite / a_rewrite.
+
+        - APPEND_RULE: emphasize extracting a concise rule from metric feedback.
+        - APPEND_DEMO: emphasize appending concrete demos built from goldens.
+        """
+        base = (feedback_text or "").strip()
+
+        if strategy is SIMBAStrategy.APPEND_RULE:
+            prefix = (
+                "Strategy: Append a concise natural-language rule to the existing "
+                "prompt that addresses the issues described below. Preserve all "
+                "original instructions and add the new rule(s) in a clearly marked "
+                '"Rules" or "Guidelines" section.\n\n'
+            )
+            text = prefix
+            if base:
+                text += "Evaluation feedback:\n" + base
+            return self._truncate_instruction(text)
+
+        if strategy is SIMBAStrategy.APPEND_DEMO:
+            demos = self._build_demo_block(minibatch)
+            prefix = (
+                "Strategy: Append one or more concrete input/output demonstrations "
+                "to the prompt. Each demo should illustrate how to respond "
+                "correctly on similar inputs.\n\n"
+            )
+            text = prefix
+            if base:
+                text += "Evaluation feedback:\n" + base + "\n\n"
+            if demos:
+                text += (
+                    "Candidate demos built from the current minibatch:\n"
+                    + demos
+                )
+            return self._truncate_instruction(text)
+
+        # just pass through feedback.
+        return self._truncate_instruction(base)
+
+    def _sample_strategy(self) -> SIMBAStrategy:
+        """
+        Sample one of the configured SIMBA edit strategies.
+
+        Defaults to APPEND_RULE if the strategy list is empty for any reason.
+        """
+        return self.random_state.choice(self._strategies)
+
+    def _update_progress(
+        self,
+        total_iterations: int,
+        iteration: int,
+        remaining_iterations: int,
+        elapsed: float,
+    ) -> None:
+        if self.status_callback is not None:
+            detail = (
+                f"(iterations={total_iterations}) "
+                f"• iteration {iteration}/{total_iterations} "
+                f"• {elapsed:.2f}s • remaining={remaining_iterations}"
+            )
+            self.status_callback(
+                RunnerStatusType.PROGRESS,
+                step_index=iteration,
+                total_steps=total_iterations,
+                detail=detail,
+            )
+
+    def _update_error(
+        self,
+        total_iterations: int,
+        iteration: int,
+        exc: Exception,
+    ) -> None:
+        # Report a user-facing error event.
+        if self.status_callback is not None:
+            detail = (
+                f"(iterations={total_iterations}) "
+                f"• error {exc.__class__.__name__}: {exc} "
+                f"• halted at iteration {iteration}"
+            )
+            self.status_callback(
+                RunnerStatusType.ERROR,
+                step_index=iteration,
+                total_steps=total_iterations,
+                detail=detail,
+            )
+
+    def _run_loop_iteration(
+        self,
+        simba_iteration: Callable[[], bool],
+    ) -> None:
+        total_iterations = self.iterations
+        remaining_iterations = total_iterations
+        iteration = 0
+        self._update_progress(
+            total_iterations, iteration, remaining_iterations, 0.0
+        )
+        while remaining_iterations > 0:
+            iteration += 1
+            start_time = time.perf_counter()
+            try:
+                ok = simba_iteration()
+            except Exception as exc:
+                self._update_error(total_iterations, iteration, exc)
+                break
+            elapsed = time.perf_counter() - start_time
+            if not ok:
+                break
+            remaining_iterations -= 1
+            self._update_progress(
+                total_iterations, iteration, remaining_iterations, elapsed
+            )
+
+    async def _a_run_loop_iteration(
+        self,
+        a_simba_iteration: Callable[[], Awaitable[bool]],
+    ) -> None:
+        total_iterations = self.iterations
+        remaining_iterations = total_iterations
+        iteration = 0
+        self._update_progress(
+            total_iterations, iteration, remaining_iterations, 0.0
+        )
+        while remaining_iterations > 0:
+            iteration += 1
+            start_time = time.perf_counter()
+            try:
+                ok = await a_simba_iteration()
+            except Exception as exc:
+                self._update_error(total_iterations, iteration, exc)
+                break
+            elapsed = time.perf_counter() - start_time
+            if not ok:
+                break
+            remaining_iterations -= 1
+            self._update_progress(
+                total_iterations, iteration, remaining_iterations, elapsed
+            )