deepeval 3.7.3__py3-none-any.whl → 3.7.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84)
  1. deepeval/_version.py +1 -1
  2. deepeval/cli/test.py +1 -1
  3. deepeval/config/settings.py +102 -13
  4. deepeval/evaluate/configs.py +1 -1
  5. deepeval/evaluate/execute.py +4 -1
  6. deepeval/metrics/answer_relevancy/template.py +4 -4
  7. deepeval/metrics/argument_correctness/template.py +2 -2
  8. deepeval/metrics/bias/template.py +3 -3
  9. deepeval/metrics/contextual_precision/template.py +6 -6
  10. deepeval/metrics/contextual_recall/template.py +2 -2
  11. deepeval/metrics/contextual_relevancy/template.py +3 -3
  12. deepeval/metrics/conversation_completeness/template.py +2 -2
  13. deepeval/metrics/conversational_dag/templates.py +4 -4
  14. deepeval/metrics/conversational_g_eval/template.py +4 -3
  15. deepeval/metrics/dag/templates.py +4 -4
  16. deepeval/metrics/faithfulness/template.py +4 -4
  17. deepeval/metrics/hallucination/template.py +4 -4
  18. deepeval/metrics/misuse/template.py +2 -2
  19. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +7 -7
  20. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +6 -6
  21. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +2 -2
  22. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +3 -3
  23. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +9 -9
  24. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +4 -4
  25. deepeval/metrics/non_advice/template.py +2 -2
  26. deepeval/metrics/pii_leakage/template.py +2 -2
  27. deepeval/metrics/prompt_alignment/template.py +4 -4
  28. deepeval/metrics/role_violation/template.py +2 -2
  29. deepeval/metrics/step_efficiency/step_efficiency.py +1 -1
  30. deepeval/metrics/toxicity/template.py +4 -4
  31. deepeval/metrics/turn_relevancy/template.py +2 -2
  32. deepeval/models/embedding_models/azure_embedding_model.py +28 -15
  33. deepeval/models/embedding_models/local_embedding_model.py +23 -10
  34. deepeval/models/embedding_models/ollama_embedding_model.py +8 -6
  35. deepeval/models/embedding_models/openai_embedding_model.py +18 -2
  36. deepeval/models/llms/anthropic_model.py +17 -5
  37. deepeval/models/llms/azure_model.py +30 -18
  38. deepeval/models/llms/deepseek_model.py +22 -12
  39. deepeval/models/llms/gemini_model.py +120 -87
  40. deepeval/models/llms/grok_model.py +23 -16
  41. deepeval/models/llms/kimi_model.py +23 -12
  42. deepeval/models/llms/litellm_model.py +63 -25
  43. deepeval/models/llms/local_model.py +26 -18
  44. deepeval/models/llms/ollama_model.py +17 -7
  45. deepeval/models/llms/openai_model.py +22 -17
  46. deepeval/models/llms/portkey_model.py +132 -0
  47. deepeval/models/mlllms/azure_model.py +28 -19
  48. deepeval/models/mlllms/gemini_model.py +102 -73
  49. deepeval/models/mlllms/ollama_model.py +40 -9
  50. deepeval/models/mlllms/openai_model.py +65 -14
  51. deepeval/models/utils.py +48 -3
  52. deepeval/optimization/__init__.py +13 -0
  53. deepeval/optimization/adapters/__init__.py +2 -0
  54. deepeval/optimization/adapters/deepeval_scoring_adapter.py +588 -0
  55. deepeval/optimization/aggregates.py +14 -0
  56. deepeval/optimization/configs.py +34 -0
  57. deepeval/optimization/copro/configs.py +31 -0
  58. deepeval/optimization/copro/loop.py +837 -0
  59. deepeval/optimization/gepa/__init__.py +7 -0
  60. deepeval/optimization/gepa/configs.py +115 -0
  61. deepeval/optimization/gepa/loop.py +677 -0
  62. deepeval/optimization/miprov2/configs.py +134 -0
  63. deepeval/optimization/miprov2/loop.py +785 -0
  64. deepeval/optimization/mutations/__init__.py +0 -0
  65. deepeval/optimization/mutations/prompt_rewriter.py +458 -0
  66. deepeval/optimization/policies/__init__.py +16 -0
  67. deepeval/optimization/policies/selection.py +166 -0
  68. deepeval/optimization/policies/tie_breaker.py +67 -0
  69. deepeval/optimization/prompt_optimizer.py +462 -0
  70. deepeval/optimization/simba/__init__.py +0 -0
  71. deepeval/optimization/simba/configs.py +33 -0
  72. deepeval/optimization/simba/loop.py +983 -0
  73. deepeval/optimization/simba/types.py +15 -0
  74. deepeval/optimization/types.py +361 -0
  75. deepeval/optimization/utils.py +598 -0
  76. deepeval/prompt/prompt.py +10 -5
  77. deepeval/test_run/cache.py +2 -0
  78. deepeval/test_run/test_run.py +6 -1
  79. deepeval/utils.py +24 -0
  80. {deepeval-3.7.3.dist-info → deepeval-3.7.4.dist-info}/METADATA +1 -1
  81. {deepeval-3.7.3.dist-info → deepeval-3.7.4.dist-info}/RECORD +84 -59
  82. {deepeval-3.7.3.dist-info → deepeval-3.7.4.dist-info}/LICENSE.md +0 -0
  83. {deepeval-3.7.3.dist-info → deepeval-3.7.4.dist-info}/WHEEL +0 -0
  84. {deepeval-3.7.3.dist-info → deepeval-3.7.4.dist-info}/entry_points.txt +0 -0
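
The bulk of this release is the new deepeval/optimization package (SIMBA, GEPA, COPRO, and MIPROv2 loops plus a PromptOptimizer entry point); its largest new file, deepeval/optimization/simba/loop.py, is shown below. For orientation, here is a minimal, hypothetical driver sketch inferred only from the APIs visible in this diff. PromptOptimizer normally wires these pieces together; `my_scoring_adapter`, `my_model_callback`, `seed_prompt`, and `goldens` are placeholders, and the SIMBAConfig values are illustrative assumptions, not real defaults:

# Hypothetical usage sketch, assuming SIMBAConfig accepts the fields that
# loop.py reads as keyword arguments. Not taken from deepeval docs.
from deepeval.optimization.simba.configs import SIMBAConfig
from deepeval.optimization.simba.loop import SIMBARunner

config = SIMBAConfig(
    iterations=10,                # outer loop budget
    population_size=6,            # bounded candidate pool
    proposals_per_step=3,         # children proposed per iteration
    min_delta=0.01,               # required minibatch-score improvement
    exploration_probability=0.2,  # epsilon for epsilon-greedy selection
    full_eval_every=5,            # periodic full-set re-scoring
)

runner = SIMBARunner(config=config, scoring_adapter=my_scoring_adapter)
runner.model_callback = my_model_callback  # consumed by the PromptRewriter

# `seed_prompt` is a deepeval Prompt; `goldens` is a list of Golden /
# ConversationalGolden. Returns the best Prompt plus a report dict.
best_prompt, report = runner.execute(prompt=seed_prompt, goldens=goldens)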
deepeval/optimization/simba/loop.py
@@ -0,0 +1,983 @@
+ # - SIMBA-style multi-strategy 0-shot variant:
+ #   - Works on a single set of goldens (no D_pareto split).
+ #   - Maintains a bounded population of candidate prompts
+ #     (size controlled by `population_size`).
+ #   - At each iteration:
+ #       - Select a parent via epsilon-greedy on mean minibatch score.
+ #       - Sample a minibatch of goldens for scoring.
+ #       - Compute feedback once for the parent + minibatch.
+ #       - Propose multiple child prompts cooperatively from the same parent
+ #         (up to `proposals_per_step` children), each using a SIMBA edit
+ #         strategy (e.g., APPEND_DEMO or APPEND_RULE).
+ #       - For each child, accept it if its minibatch score improves on the
+ #         parent by at least `min_delta`, add it to the pool, and prune
+ #         low-scoring candidates if the population exceeds `population_size`.
+ #   - Uses `full_eval_every` (if set) to periodically re-score the current
+ #     best candidate on the full golden set.
+
+ from __future__ import annotations
+
+ import random
+ import time
+ import uuid
+ from typing import (
+     Awaitable,
+     Callable,
+     Dict,
+     List,
+     Optional,
+     Tuple,
+     Union,
+ )
+
+ from deepeval.errors import DeepEvalError
+ from deepeval.dataset.golden import ConversationalGolden, Golden
+ from deepeval.optimization.aggregates import Aggregator, mean_of_all
+ from deepeval.optimization.types import (
+     AcceptedIterationDict,
+     ModuleId,
+     OptimizationResult,
+     PromptConfiguration,
+     PromptConfigurationId,
+     RunnerStatusCallbackProtocol,
+     RunnerStatusType,
+     ScoreTable,
+     ScoringAdapter,
+ )
+ from deepeval.optimization.utils import build_prompt_config_snapshots
+ from deepeval.prompt.api import PromptType
+ from deepeval.prompt.prompt import Prompt
+ from deepeval.optimization.mutations.prompt_rewriter import PromptRewriter
+
+ from .configs import SIMBAConfig
+ from .types import SIMBAStrategy
+
+
+ class SIMBARunner:
+     """
+     SIMBA-style cooperative prompt optimization loop with sync/async execution.
+
+     This runner is intentionally low level and does not know about metrics,
+     models, or async configs. It relies on a preconfigured ScoringAdapter and
+     PromptRewriter, which are typically constructed by PromptOptimizer.
+
+     - Optimizes a single Prompt (instruction) against a list of Goldens.
+     - Uses mini-batches of goldens for trial scoring and epsilon-greedy
+       selection over prompt candidates based on mean minibatch scores.
+     - At each iteration, proposes multiple child prompts using SIMBA-style
+       edit strategies (APPEND_DEMO and APPEND_RULE) by passing different
+       instructions into the PromptRewriter.
+     """
+
+     SINGLE_MODULE_ID: ModuleId = "__module__"
+
+     def __init__(
+         self,
+         *,
+         config: SIMBAConfig,
+         aggregate_instances: Aggregator = mean_of_all,
+         scoring_adapter: Optional[ScoringAdapter] = None,
+     ) -> None:
+         self.config = config
+         self.aggregate_instances = aggregate_instances
+         self.scoring_adapter = scoring_adapter
+
+         if config.max_demos_per_proposal > 0:
+             self._strategies = [
+                 SIMBAStrategy.APPEND_DEMO,
+                 SIMBAStrategy.APPEND_RULE,
+             ]
+         else:
+             self._strategies = [SIMBAStrategy.APPEND_RULE]
+
+         # A Random instance seeded from config, used for minibatch sampling,
+         # strategy selection, and epsilon-greedy candidate selection.
+         self.random_state = random.Random(config.random_seed)
+
+         # Runtime state to be reset between runs.
+         self.reset_state()
+
+         # Status callback set by PromptOptimizer:
+         # (kind, step_index, total_steps, detail) -> None
+         self.status_callback: Optional[RunnerStatusCallbackProtocol] = None
+
+         # Model callback used by the rewriter; set by PromptOptimizer.
+         self.model_callback: Optional[
+             Callable[
+                 ...,
+                 Union[
+                     str,
+                     Dict,
+                     Tuple[Union[str, Dict], float],
+                 ],
+             ]
+         ] = None
+
+         # Lazy-loaded PromptRewriter set by PromptOptimizer.
+         self._rewriter: Optional[PromptRewriter] = None
+
+     ##############
+     # Public API #
+     ##############
+
+     def execute(
+         self,
+         *,
+         prompt: Prompt,
+         goldens: Union[List[Golden], List[ConversationalGolden]],
+     ) -> Tuple[Prompt, Dict]:
+         """
+         Synchronous SIMBA run over a full list of goldens.
+
+         The full goldens set is used both for mini-batched scoring during
+         optimization and for a final full evaluation of the best candidate.
+         """
+         total_goldens = len(goldens)
+         if total_goldens < 1:
+             raise DeepEvalError(
+                 "SIMBA prompt optimization requires at least 1 golden, but "
+                 f"received {total_goldens}. Provide at least one golden to run "
+                 "the optimizer."
+             )
+
+         self._ensure_scoring_adapter()
+         self._ensure_rewriter()
+         self.reset_state()
+
+         # Seed the candidate pool with the root prompt configuration.
+         seed_prompts_by_module = {self.SINGLE_MODULE_ID: prompt}
+         root_prompt_configuration = PromptConfiguration.new(
+             prompts=dict(seed_prompts_by_module)
+         )
+         # Add the root candidate to the pool, but defer its first minibatch
+         # evaluation until the first iteration so that any long-running
+         # model calls happen under the main loop (with progress updates).
+         self._add_prompt_configuration(root_prompt_configuration)
+
+         accepted_iterations: List[Dict] = []
+         self.trial_index = 0
+
+         def _one_iteration() -> bool:
+             nonlocal accepted_iterations
+
+             if not goldens:
+                 return False
+
+             # Lazily seed a minibatch score for the root candidate on the
+             # first iteration.
+             if not self._minibatch_score_counts:
+                 seed_minibatch = self._draw_minibatch(goldens)
+                 root_score = self.scoring_adapter.minibatch_score(
+                     root_prompt_configuration, seed_minibatch
+                 )
+                 self._record_minibatch_score(
+                     root_prompt_configuration.id, root_score
+                 )
+
+             # 1. Choose which candidate prompt to mutate.
+             parent_prompt_configuration = self._select_candidate()
+             selected_module_id: ModuleId = self.SINGLE_MODULE_ID
+
+             minibatch = self._draw_minibatch(goldens)
+
+             # Compute shared feedback for this parent/minibatch that will be
+             # used by all SIMBA proposals in this iteration.
+             feedback_text = self.scoring_adapter.minibatch_feedback(
+                 parent_prompt_configuration, selected_module_id, minibatch
+             )
+
+             before_mean = self._mean_minibatch_score(
+                 parent_prompt_configuration.id
+             )
+             jitter = 1e-6
+             min_delta = max(self.config.min_delta, jitter)
+
+             # 2. Generate multiple SIMBA child prompts and evaluate them.
+             num_proposals = int(self.config.proposals_per_step)
+             for _ in range(num_proposals):
+                 strategy = self._sample_strategy()
+                 child_prompt = self._generate_child_prompt(
+                     strategy,
+                     selected_module_id,
+                     parent_prompt_configuration,
+                     feedback_text,
+                     minibatch,
+                 )
+                 if child_prompt is None:
+                     # No child, nothing to evaluate for this proposal.
+                     continue
+
+                 child_prompt_configuration = self._make_child(
+                     selected_module_id,
+                     parent_prompt_configuration,
+                     child_prompt,
+                 )
+
+                 child_score = self.scoring_adapter.minibatch_score(
+                     child_prompt_configuration, minibatch
+                 )
+
+                 # 3. Decide whether to accept the child.
+                 if child_score >= before_mean + min_delta:
+                     # Accept: add to pool, update surrogate stats, and
+                     # record the iteration.
+                     self._add_prompt_configuration(child_prompt_configuration)
+                     self._record_minibatch_score(
+                         child_prompt_configuration.id, child_score
+                     )
+
+                     accepted_iterations.append(
+                         AcceptedIterationDict(
+                             parent=parent_prompt_configuration.id,
+                             child=child_prompt_configuration.id,
+                             module=selected_module_id,
+                             before=before_mean,
+                             after=child_score,
+                         )
+                     )
+                 # else: reject; do not add the child to the candidate pool.
+
+             self.trial_index += 1
+             if (
+                 self.config.full_eval_every is not None
+                 and self.trial_index % self.config.full_eval_every == 0
+             ):
+                 self._full_evaluate_best(goldens)
+
+             return True
+
+         self._run_loop_iteration(_one_iteration)
+
+         # Ensure at least one candidate has been fully evaluated.
+         if not self.pareto_score_table:
+             self._full_evaluate_best(goldens)
+
+         best = self._best_by_aggregate()
+         prompt_config_snapshots = build_prompt_config_snapshots(
+             self.prompt_configurations_by_id
+         )
+         report = OptimizationResult(
+             optimization_id=self.optimization_id,
+             best_id=best.id,
+             accepted_iterations=accepted_iterations,
+             pareto_scores=self.pareto_score_table,
+             parents=self.parents_by_id,
+             prompt_configurations=prompt_config_snapshots,
+         )
+         return best.prompts[self.SINGLE_MODULE_ID], report.as_dict()
+
+     async def a_execute(
+         self,
+         *,
+         prompt: Prompt,
+         goldens: Union[List[Golden], List[ConversationalGolden]],
+     ) -> Tuple[Prompt, Dict]:
+         """
+         Asynchronous twin of execute().
+         """
+         total_goldens = len(goldens)
+         if total_goldens < 1:
+             raise DeepEvalError(
+                 "SIMBA prompt optimization requires at least 1 golden, but "
+                 f"received {total_goldens}. Provide at least one golden to run "
+                 "the optimizer."
+             )
+
+         self._ensure_scoring_adapter()
+         self._ensure_rewriter()
+         self.reset_state()
+
+         seed_prompts_by_module = {self.SINGLE_MODULE_ID: prompt}
+         root_prompt_configuration = PromptConfiguration.new(
+             prompts=dict(seed_prompts_by_module)
+         )
+         self._add_prompt_configuration(root_prompt_configuration)
+
+         accepted_iterations: List[Dict] = []
+         self.trial_index = 0
+
+         async def _one_iteration() -> bool:
+             nonlocal accepted_iterations
+
+             if not goldens:
+                 return False
+
+             if not self._minibatch_score_counts:
+                 seed_minibatch = self._draw_minibatch(goldens)
+                 root_score = await self.scoring_adapter.a_minibatch_score(
+                     root_prompt_configuration, seed_minibatch
+                 )
+                 self._record_minibatch_score(
+                     root_prompt_configuration.id, root_score
+                 )
+
+             parent_prompt_configuration = self._select_candidate()
+             selected_module_id: ModuleId = self.SINGLE_MODULE_ID
+
+             minibatch = self._draw_minibatch(goldens)
+
+             feedback_text = await self.scoring_adapter.a_minibatch_feedback(
+                 parent_prompt_configuration, selected_module_id, minibatch
+             )
+
+             before_mean = self._mean_minibatch_score(
+                 parent_prompt_configuration.id
+             )
+             jitter = 1e-6
+             min_delta = max(self.config.min_delta, jitter)
+
+             num_proposals = int(self.config.proposals_per_step)
+             for _ in range(num_proposals):
+                 strategy = self._sample_strategy()
+                 child_prompt = await self._a_generate_child_prompt(
+                     strategy,
+                     selected_module_id,
+                     parent_prompt_configuration,
+                     feedback_text,
+                     minibatch,
+                 )
+                 if child_prompt is None:
+                     continue
+
+                 child_prompt_configuration = self._make_child(
+                     selected_module_id,
+                     parent_prompt_configuration,
+                     child_prompt,
+                 )
+
+                 child_score = await self.scoring_adapter.a_minibatch_score(
+                     child_prompt_configuration, minibatch
+                 )
+
+                 if child_score >= before_mean + min_delta:
+                     self._add_prompt_configuration(child_prompt_configuration)
+                     self._record_minibatch_score(
+                         child_prompt_configuration.id, child_score
+                     )
+
+                     accepted_iterations.append(
+                         AcceptedIterationDict(
+                             parent=parent_prompt_configuration.id,
+                             child=child_prompt_configuration.id,
+                             module=selected_module_id,
+                             before=before_mean,
+                             after=child_score,
+                         )
+                     )
+
+             self.trial_index += 1
+             if (
+                 self.config.full_eval_every is not None
+                 and self.trial_index % self.config.full_eval_every == 0
+             ):
+                 await self._a_full_evaluate_best(goldens)
+
+             return True
+
+         await self._a_run_loop_iteration(_one_iteration)
+
+         if not self.pareto_score_table:
+             await self._a_full_evaluate_best(goldens)
+
+         best = self._best_by_aggregate()
+         prompt_config_snapshots = build_prompt_config_snapshots(
+             self.prompt_configurations_by_id
+         )
+         report = OptimizationResult(
+             optimization_id=self.optimization_id,
+             best_id=best.id,
+             accepted_iterations=accepted_iterations,
+             pareto_scores=self.pareto_score_table,
+             parents=self.parents_by_id,
+             prompt_configurations=prompt_config_snapshots,
+         )
+         return best.prompts[self.SINGLE_MODULE_ID], report.as_dict()
+
+     ###################
+     # State & helpers #
+     ###################
+
+     def reset_state(self) -> None:
+         self.optimization_id = str(uuid.uuid4())
+         self.prompt_configurations_by_id: Dict[
+             PromptConfigurationId, PromptConfiguration
+         ] = {}
+         self.parents_by_id: Dict[
+             PromptConfigurationId, Optional[PromptConfigurationId]
+         ] = {}
+         # For SIMBA we reuse the same field name as GEPA for full-eval scores.
+         self.pareto_score_table: ScoreTable = {}
+
+         # Surrogate stats: running mean minibatch scores per candidate.
+         self._minibatch_score_sums: Dict[PromptConfigurationId, float] = {}
+         self._minibatch_score_counts: Dict[PromptConfigurationId, int] = {}
+
+         # Trial counter (used for full_eval_every).
+         self.trial_index: int = 0
+
+     def _ensure_scoring_adapter(self) -> None:
+         if self.scoring_adapter is None:
+             raise DeepEvalError(
+                 "SIMBARunner requires a `scoring_adapter`. "
+                 "Construct one (for example, DeepEvalScoringAdapter) in "
+                 "PromptOptimizer and assign it to `runner.scoring_adapter`."
+             )
+
+     def _ensure_rewriter(self) -> None:
+         if self._rewriter is not None:
+             return
+
+         # Default basic PromptRewriter; PromptOptimizer can override this and
+         # pass a configured instance (e.g. with list-mutation config).
+         self._rewriter = PromptRewriter(
+             max_chars=self.config.rewrite_instruction_max_chars,
+             random_state=self.random_state,
+         )
+
+     def _prompts_equivalent(
+         self,
+         old_prompt: Prompt,
+         new_prompt: Prompt,
+     ) -> bool:
+         """
+         Compare two Prompts for optimization purposes.
+
+         Callers reject children whose type differs from the parent's before
+         calling this helper. Here a child counts as "no change" if:
+         - For TEXT: the trimmed text_template matches.
+         - For LIST: messages_template length, roles, and trimmed content match.
+         """
+
+         if new_prompt.type == PromptType.LIST:
+             old_msgs = old_prompt.messages_template
+             new_msgs = new_prompt.messages_template
+             if len(old_msgs) != len(new_msgs):
+                 return False
+
+             for old_msg, new_msg in zip(old_msgs, new_msgs):
+                 if old_msg.role != new_msg.role:
+                     return False
+                 if (old_msg.content or "").strip() != (
+                     new_msg.content or ""
+                 ).strip():
+                     return False
+
+             return True
+
+         old_txt = (old_prompt.text_template or "").strip()
+         new_txt = (new_prompt.text_template or "").strip()
+         return new_txt == old_txt
+
+     def _add_prompt_configuration(
+         self,
+         prompt_configuration: PromptConfiguration,
+     ) -> None:
+         """
+         Add a candidate to the active pool and, if a population limit is set,
+         prune the worst-scoring candidates to enforce it.
+         """
+         self.prompt_configurations_by_id[prompt_configuration.id] = (
+             prompt_configuration
+         )
+         self.parents_by_id[prompt_configuration.id] = (
+             prompt_configuration.parent
+         )
+
+         # If we exceed the population size, iteratively prune the worst
+         # (by mean minibatch score), never removing the current best.
+         while (
+             len(self.prompt_configurations_by_id) > self.config.population_size
+         ):
+             best_id: Optional[PromptConfigurationId] = None
+             best_score = float("-inf")
+             for cand_id in self.prompt_configurations_by_id.keys():
+                 mean_score = self._mean_minibatch_score(cand_id)
+                 if mean_score > best_score:
+                     best_score = mean_score
+                     best_id = cand_id
+
+             worst_id: Optional[PromptConfigurationId] = None
+             worst_score = float("inf")
+             for cand_id in self.prompt_configurations_by_id.keys():
+                 if cand_id == best_id:
+                     continue
+                 mean_score = self._mean_minibatch_score(cand_id)
+                 if mean_score < worst_score:
+                     worst_score = mean_score
+                     worst_id = cand_id
+
+             if worst_id is None or worst_id == best_id:
+                 break
+
+             # Prune the chosen worst candidate from all bookkeeping tables.
+             self.prompt_configurations_by_id.pop(worst_id, None)
+             self.parents_by_id.pop(worst_id, None)
+             self._minibatch_score_sums.pop(worst_id, None)
+             self._minibatch_score_counts.pop(worst_id, None)
+             self.pareto_score_table.pop(worst_id, None)
+
+     def _record_minibatch_score(
+         self,
+         prompt_configuration_id: PromptConfigurationId,
+         score: float,
+     ) -> None:
+         self._minibatch_score_sums[prompt_configuration_id] = (
+             self._minibatch_score_sums.get(prompt_configuration_id, 0.0)
+             + float(score)
+         )
+         self._minibatch_score_counts[prompt_configuration_id] = (
+             self._minibatch_score_counts.get(prompt_configuration_id, 0) + 1
+         )
+
+     def _mean_minibatch_score(
+         self,
+         prompt_configuration_id: PromptConfigurationId,
+     ) -> float:
+         total = self._minibatch_score_sums.get(prompt_configuration_id, 0.0)
+         count = self._minibatch_score_counts.get(prompt_configuration_id, 0)
+         if count <= 0:
+             # Use a sentinel that will not dominate selection if a scored
+             # candidate exists. Root is seeded explicitly in the first iteration.
+             return float("-inf")
+         return total / count
+
+     def _best_by_minibatch(self) -> PromptConfiguration:
+         """
+         Return the candidate with the highest mean minibatch score.
+         """
+         if not self.prompt_configurations_by_id:
+             raise DeepEvalError(
+                 "SIMBARunner has no prompt configurations; this should not happen."
+             )
+
+         best_id: Optional[PromptConfigurationId] = None
+         best_score = float("-inf")
+
+         for cand_id in self.prompt_configurations_by_id.keys():
+             mean_score = self._mean_minibatch_score(cand_id)
+             if mean_score > best_score:
+                 best_score = mean_score
+                 best_id = cand_id
+
+         if best_id is None:
+             # Fall back to the first candidate if all means are -inf.
+             best_id = next(iter(self.prompt_configurations_by_id.keys()))
+
+         return self.prompt_configurations_by_id[best_id]
+
+     def _best_by_aggregate(self) -> PromptConfiguration:
+         """
+         Return the best candidate based on full-eval scores.
+
+         If no full-evaluation scores are available (rare, but possible if
+         full_eval_every is very large and the loop exits early), fall back to
+         best-by-minibatch.
+         """
+         if not self.pareto_score_table:
+             return self._best_by_minibatch()
+
+         totals = {
+             prompt_configuration_id: self.aggregate_instances(vector)
+             for prompt_configuration_id, vector in self.pareto_score_table.items()
+         }
+
+         best_ids: List[PromptConfigurationId] = []
+         best_val = float("-inf")
+
+         for cand_id, aggregate in totals.items():
+             if aggregate > best_val + 1e-12:
+                 best_val = aggregate
+                 best_ids = [cand_id]
+             elif abs(aggregate - best_val) <= 1e-12:
+                 best_ids.append(cand_id)
+
+         chosen_id = self.random_state.choice(best_ids)
+         return self.prompt_configurations_by_id[chosen_id]
+
+     def _select_candidate(self) -> PromptConfiguration:
+         """
+         Epsilon-greedy candidate selection:
+
+         - With probability ``exploration_probability``, pick a random candidate.
+         - Otherwise, pick the candidate with the highest mean minibatch score.
+         """
+         if not self.prompt_configurations_by_id:
+             raise DeepEvalError(
+                 "SIMBARunner has no prompt configurations to select from."
+             )
+
+         candidate_ids = list(self.prompt_configurations_by_id.keys())
+         if not candidate_ids:
+             raise DeepEvalError(
+                 "SIMBARunner has an empty candidate pool; this should not happen."
+             )
+
+         eps = float(self.config.exploration_probability)
+         if eps > 0.0 and self.random_state.random() < eps:
+             chosen_id = self.random_state.choice(candidate_ids)
+         else:
+             chosen_id = self._best_by_minibatch().id
+
+         return self.prompt_configurations_by_id[chosen_id]
+
+     def _draw_minibatch(
+         self,
+         goldens: Union[List[Golden], List[ConversationalGolden]],
+     ) -> Union[List[Golden], List[ConversationalGolden]]:
+         """
+         Determine the effective minibatch size from SIMBAConfig, bounded by
+         the available goldens, and sample with replacement.
+         """
+         n = len(goldens)
+         if n <= 0:
+             return []
+
+         if self.config.minibatch_size is not None:
+             size = self.config.minibatch_size
+         else:
+             dynamic = max(1, int(round(n * self.config.minibatch_ratio)))
+             size = max(
+                 self.config.minibatch_min_size,
+                 min(dynamic, self.config.minibatch_max_size),
+             )
+
+         size = max(1, min(size, n))
+
+         return [goldens[self.random_state.randrange(0, n)] for _ in range(size)]
+
+     async def _a_full_evaluate_best(
+         self,
+         goldens: Union[List[Golden], List[ConversationalGolden]],
+     ) -> None:
+         if not self.prompt_configurations_by_id:
+             return
+
+         best = self._best_by_minibatch()
+         if best.id in self.pareto_score_table:
+             return
+
+         scores = await self.scoring_adapter.a_score_on_pareto(best, goldens)
+         self.pareto_score_table[best.id] = scores
+
+     def _full_evaluate_best(
+         self,
+         goldens: Union[List[Golden], List[ConversationalGolden]],
+     ) -> None:
+         if not self.prompt_configurations_by_id:
+             return
+
+         best = self._best_by_minibatch()
+         if best.id in self.pareto_score_table:
+             return
+
+         scores = self.scoring_adapter.score_on_pareto(best, goldens)
+         self.pareto_score_table[best.id] = scores
+
+     async def _a_generate_child_prompt(
+         self,
+         strategy: SIMBAStrategy,
+         selected_module_id: ModuleId,
+         parent_prompt_configuration: PromptConfiguration,
+         feedback_text: str,
+         minibatch: Union[List[Golden], List[ConversationalGolden]],
+     ) -> Optional[Prompt]:
+         try:
+             old_prompt = parent_prompt_configuration.prompts[selected_module_id]
+         except KeyError as exc:
+             raise DeepEvalError(
+                 "SIMBARunner expected a prompt for module_id "
+                 f"{selected_module_id!r} but none was found in the "
+                 "current prompt configuration."
+             ) from exc
+
+         strategy_feedback = self._build_feedback_for_strategy(
+             strategy, feedback_text, minibatch
+         )
+
+         new_prompt = await self._rewriter.a_rewrite(
+             model_callback=self.model_callback,
+             module_id=selected_module_id,
+             old_prompt=old_prompt,
+             feedback_text=strategy_feedback,
+         )
+
+         if old_prompt.type != new_prompt.type or self._prompts_equivalent(
+             old_prompt, new_prompt
+         ):
+             # Don't accept if the new prompt is the same as the parent, or
+             # if its type changed.
+             return None
+         return new_prompt
+
+     def _generate_child_prompt(
+         self,
+         strategy: SIMBAStrategy,
+         selected_module_id: ModuleId,
+         parent_prompt_configuration: PromptConfiguration,
+         feedback_text: str,
+         minibatch: Union[List[Golden], List[ConversationalGolden]],
+     ) -> Optional[Prompt]:
+         try:
+             old_prompt = parent_prompt_configuration.prompts[selected_module_id]
+         except KeyError as exc:
+             # This should never happen in normal operation.
+             raise DeepEvalError(
+                 "SIMBARunner expected a prompt for module_id "
+                 f"{selected_module_id!r} but none was found in the "
+                 "current prompt configuration."
+             ) from exc
+
+         strategy_feedback = self._build_feedback_for_strategy(
+             strategy, feedback_text, minibatch
+         )
+
+         new_prompt = self._rewriter.rewrite(
+             model_callback=self.model_callback,
+             module_id=selected_module_id,
+             old_prompt=old_prompt,
+             feedback_text=strategy_feedback,
+         )
+
+         if old_prompt.type != new_prompt.type or self._prompts_equivalent(
+             old_prompt, new_prompt
+         ):
+             # Don't accept if the new prompt is the same as the parent, or
+             # if its type changed.
+             return None
+         return new_prompt
+
+     def _make_child(
+         self,
+         selected_module_id: ModuleId,
+         parent_prompt_configuration: PromptConfiguration,
+         child_prompt: Prompt,
+     ) -> PromptConfiguration:
+         child_prompt_configuration = PromptConfiguration.new(
+             prompts=dict(parent_prompt_configuration.prompts),
+             parent=parent_prompt_configuration.id,
+         )
+         child_prompt_configuration.prompts[selected_module_id] = child_prompt
+         return child_prompt_configuration
+
+     def _truncate_instruction(self, text: str) -> str:
+         """
+         Truncate strategy instructions + feedback to the configured character
+         budget so the rewriter prompt does not explode.
+         """
+         max_chars = self.config.rewrite_instruction_max_chars
+         if max_chars <= 0:
+             return text
+         if len(text) <= max_chars:
+             return text
+         return text[:max_chars]
+
+     def _build_demo_block(
+         self,
+         minibatch: Union[List[Golden], List[ConversationalGolden]],
+     ) -> str:
+         """
+         Build a small block of input/context/output demos from the current
+         minibatch, inspired by SIMBA's `append_a_demo` strategy.
+
+         For each golden in the minibatch:
+
+         Golden:
+             Input   <- golden.input
+             Context <- " ".join(golden.context) if present
+             Output  <- golden.expected_output
+
+         ConversationalGolden:
+             Input   <- golden.scenario
+             Context <- " ".join(golden.context) if present
+             Output  <- golden.expected_outcome
+
+         All text segments are independently truncated to `demo_input_max_chars`.
+         """
+         max_demos = self.config.max_demos_per_proposal
+         if max_demos <= 0:
+             return ""
+
+         lines: List[str] = []
+         demo_limit = min(max_demos, len(minibatch))
+         max_chars = self.config.demo_input_max_chars
+
+         for golden in minibatch[:demo_limit]:
+             if isinstance(golden, Golden):
+                 input_text = golden.input or ""
+                 expected_output_text = golden.expected_output or ""
+                 ctx_list = golden.context or []
+             elif isinstance(golden, ConversationalGolden):
+                 input_text = golden.scenario or ""
+                 expected_output_text = golden.expected_outcome or ""
+                 ctx_list = golden.context or []
+             else:
+                 # Unknown type; skip defensively.
+                 continue
+
+             context_text = " ".join(ctx_list) if ctx_list else ""
+
+             # Skip completely empty triples.
+             if not input_text and not expected_output_text and not context_text:
+                 continue
+
+             # Truncate each segment independently.
+             if max_chars > 0:
+                 if len(input_text) > max_chars:
+                     input_text = input_text[:max_chars]
+                 if len(context_text) > max_chars:
+                     context_text = context_text[:max_chars]
+                 if len(expected_output_text) > max_chars:
+                     expected_output_text = expected_output_text[:max_chars]
+
+             demo_lines: List[str] = [f"Input: {input_text}"]
+             if context_text:
+                 demo_lines.append(f"Context: {context_text}")
+             demo_lines.append(f"Output: {expected_output_text}")
+
+             lines.append("\n".join(demo_lines))
+
+         return "\n\n".join(lines)
+
+     def _build_feedback_for_strategy(
+         self,
+         strategy: SIMBAStrategy,
+         feedback_text: str,
+         minibatch: Union[List[Golden], List[ConversationalGolden]],
+     ) -> str:
+         """
+         Construct a strategy-specific feedback string that is passed into
+         PromptRewriter.rewrite / a_rewrite.
+
+         - APPEND_RULE: emphasize extracting a concise rule from metric feedback.
+         - APPEND_DEMO: emphasize appending concrete demos built from goldens.
+         """
+         base = (feedback_text or "").strip()
+
+         if strategy is SIMBAStrategy.APPEND_RULE:
+             prefix = (
+                 "Strategy: Append a concise natural-language rule to the existing "
+                 "prompt that addresses the issues described below. Preserve all "
+                 "original instructions and add the new rule(s) in a clearly marked "
+                 '"Rules" or "Guidelines" section.\n\n'
+             )
+             text = prefix
+             if base:
+                 text += "Evaluation feedback:\n" + base
+             return self._truncate_instruction(text)
+
+         if strategy is SIMBAStrategy.APPEND_DEMO:
+             demos = self._build_demo_block(minibatch)
+             prefix = (
+                 "Strategy: Append one or more concrete input/output demonstrations "
+                 "to the prompt. Each demo should illustrate how to respond "
+                 "correctly on similar inputs.\n\n"
+             )
+             text = prefix
+             if base:
+                 text += "Evaluation feedback:\n" + base + "\n\n"
+             if demos:
+                 text += (
+                     "Candidate demos built from the current minibatch:\n"
+                     + demos
+                 )
+             return self._truncate_instruction(text)
+
+         # Unknown strategy: pass the feedback through unchanged.
+         return self._truncate_instruction(base)
+
+     def _sample_strategy(self) -> SIMBAStrategy:
+         """
+         Sample one of the configured SIMBA edit strategies.
+
+         The strategy list is seeded in __init__ and always contains at least
+         APPEND_RULE.
+         """
+         return self.random_state.choice(self._strategies)
+
+     def _update_progress(
+         self,
+         total_iterations: int,
+         iteration: int,
+         remaining_iterations: int,
+         elapsed: float,
+     ) -> None:
+         if self.status_callback is not None:
+             detail = (
+                 f"(iterations={total_iterations}) "
+                 f"• iteration {iteration}/{total_iterations} "
+                 f"• {elapsed:.2f}s • remaining={remaining_iterations}"
+             )
+             self.status_callback(
+                 RunnerStatusType.PROGRESS,
+                 step_index=iteration,
+                 total_steps=total_iterations,
+                 detail=detail,
+             )
+
+     def _update_error(
+         self,
+         total_iterations: int,
+         iteration: int,
+         exc: Exception,
+     ) -> None:
+         # Report a user-facing error event.
+         if self.status_callback is not None:
+             detail = (
+                 f"(iterations={total_iterations}) "
+                 f"• error {exc.__class__.__name__}: {exc} "
+                 f"• halted at iteration {iteration}"
+             )
+             self.status_callback(
+                 RunnerStatusType.ERROR,
+                 step_index=iteration,
+                 total_steps=total_iterations,
+                 detail=detail,
+             )
+
+     def _run_loop_iteration(
+         self,
+         simba_iteration: Callable[[], bool],
+     ) -> None:
+         total_iterations = self.config.iterations
+         remaining_iterations = total_iterations
+         iteration = 0
+         self._update_progress(
+             total_iterations, iteration, remaining_iterations, 0.0
+         )
+         while remaining_iterations > 0:
+             iteration += 1
+             start_time = time.perf_counter()
+             try:
+                 ok = simba_iteration()
+             except Exception as exc:
+                 self._update_error(total_iterations, iteration, exc)
+                 break
+             elapsed = time.perf_counter() - start_time
+             if not ok:
+                 break
+             remaining_iterations -= 1
+             self._update_progress(
+                 total_iterations, iteration, remaining_iterations, elapsed
+             )
+
+     async def _a_run_loop_iteration(
+         self,
+         a_simba_iteration: Callable[[], Awaitable[bool]],
+     ) -> None:
+         total_iterations = self.config.iterations
+         remaining_iterations = total_iterations
+         iteration = 0
+         self._update_progress(
+             total_iterations, iteration, remaining_iterations, 0.0
+         )
+         while remaining_iterations > 0:
+             iteration += 1
+             start_time = time.perf_counter()
+             try:
+                 ok = await a_simba_iteration()
+             except Exception as exc:
+                 self._update_error(total_iterations, iteration, exc)
+                 break
+             elapsed = time.perf_counter() - start_time
+             if not ok:
+                 break
+             remaining_iterations -= 1
+             self._update_progress(
+                 total_iterations, iteration, remaining_iterations, elapsed
+             )
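
To make the sizing and acceptance arithmetic in `_draw_minibatch` and `_one_iteration` concrete, here is a small self-contained sketch of the same computations. The config values are illustrative assumptions, not SIMBAConfig defaults (those live in simba/configs.py and are not part of this hunk):

# Illustrative numbers only; real defaults live in simba/configs.py.
n = 40                                   # available goldens
minibatch_ratio = 0.25
minibatch_min_size, minibatch_max_size = 3, 10

dynamic = max(1, int(round(n * minibatch_ratio)))                  # 10
size = max(minibatch_min_size, min(dynamic, minibatch_max_size))   # 10
size = max(1, min(size, n))                                        # final: 10

# Acceptance: a child must beat the parent's running mean minibatch score
# by min_delta, floored at a 1e-6 jitter so that min_delta == 0 still
# demands strict improvement.
before_mean, child_score, min_delta = 0.62, 0.66, 0.02
accepted = child_score >= before_mean + max(min_delta, 1e-6)       # True
print(size, accepted)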