deepeval 3.7.3__py3-none-any.whl → 3.7.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/cli/test.py +1 -1
- deepeval/config/settings.py +102 -13
- deepeval/evaluate/configs.py +1 -1
- deepeval/evaluate/execute.py +4 -1
- deepeval/metrics/answer_relevancy/template.py +4 -4
- deepeval/metrics/argument_correctness/template.py +2 -2
- deepeval/metrics/bias/template.py +3 -3
- deepeval/metrics/contextual_precision/template.py +6 -6
- deepeval/metrics/contextual_recall/template.py +2 -2
- deepeval/metrics/contextual_relevancy/template.py +3 -3
- deepeval/metrics/conversation_completeness/template.py +2 -2
- deepeval/metrics/conversational_dag/templates.py +4 -4
- deepeval/metrics/conversational_g_eval/template.py +4 -3
- deepeval/metrics/dag/templates.py +4 -4
- deepeval/metrics/faithfulness/template.py +4 -4
- deepeval/metrics/hallucination/template.py +4 -4
- deepeval/metrics/misuse/template.py +2 -2
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +7 -7
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +6 -6
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +2 -2
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +3 -3
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +9 -9
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +4 -4
- deepeval/metrics/non_advice/template.py +2 -2
- deepeval/metrics/pii_leakage/template.py +2 -2
- deepeval/metrics/prompt_alignment/template.py +4 -4
- deepeval/metrics/role_violation/template.py +2 -2
- deepeval/metrics/step_efficiency/step_efficiency.py +1 -1
- deepeval/metrics/toxicity/template.py +4 -4
- deepeval/metrics/turn_relevancy/template.py +2 -2
- deepeval/models/embedding_models/azure_embedding_model.py +28 -15
- deepeval/models/embedding_models/local_embedding_model.py +23 -10
- deepeval/models/embedding_models/ollama_embedding_model.py +8 -6
- deepeval/models/embedding_models/openai_embedding_model.py +18 -2
- deepeval/models/llms/anthropic_model.py +17 -5
- deepeval/models/llms/azure_model.py +30 -18
- deepeval/models/llms/deepseek_model.py +22 -12
- deepeval/models/llms/gemini_model.py +120 -87
- deepeval/models/llms/grok_model.py +23 -16
- deepeval/models/llms/kimi_model.py +23 -12
- deepeval/models/llms/litellm_model.py +63 -25
- deepeval/models/llms/local_model.py +26 -18
- deepeval/models/llms/ollama_model.py +17 -7
- deepeval/models/llms/openai_model.py +22 -17
- deepeval/models/llms/portkey_model.py +132 -0
- deepeval/models/mlllms/azure_model.py +28 -19
- deepeval/models/mlllms/gemini_model.py +102 -73
- deepeval/models/mlllms/ollama_model.py +40 -9
- deepeval/models/mlllms/openai_model.py +65 -14
- deepeval/models/utils.py +48 -3
- deepeval/optimization/__init__.py +13 -0
- deepeval/optimization/adapters/__init__.py +2 -0
- deepeval/optimization/adapters/deepeval_scoring_adapter.py +588 -0
- deepeval/optimization/aggregates.py +14 -0
- deepeval/optimization/configs.py +34 -0
- deepeval/optimization/copro/configs.py +31 -0
- deepeval/optimization/copro/loop.py +837 -0
- deepeval/optimization/gepa/__init__.py +7 -0
- deepeval/optimization/gepa/configs.py +115 -0
- deepeval/optimization/gepa/loop.py +677 -0
- deepeval/optimization/miprov2/configs.py +134 -0
- deepeval/optimization/miprov2/loop.py +785 -0
- deepeval/optimization/mutations/__init__.py +0 -0
- deepeval/optimization/mutations/prompt_rewriter.py +458 -0
- deepeval/optimization/policies/__init__.py +16 -0
- deepeval/optimization/policies/selection.py +166 -0
- deepeval/optimization/policies/tie_breaker.py +67 -0
- deepeval/optimization/prompt_optimizer.py +462 -0
- deepeval/optimization/simba/__init__.py +0 -0
- deepeval/optimization/simba/configs.py +33 -0
- deepeval/optimization/simba/loop.py +983 -0
- deepeval/optimization/simba/types.py +15 -0
- deepeval/optimization/types.py +361 -0
- deepeval/optimization/utils.py +598 -0
- deepeval/prompt/prompt.py +10 -5
- deepeval/test_run/cache.py +2 -0
- deepeval/test_run/test_run.py +6 -1
- deepeval/utils.py +24 -0
- {deepeval-3.7.3.dist-info → deepeval-3.7.4.dist-info}/METADATA +1 -1
- {deepeval-3.7.3.dist-info → deepeval-3.7.4.dist-info}/RECORD +84 -59
- {deepeval-3.7.3.dist-info → deepeval-3.7.4.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.3.dist-info → deepeval-3.7.4.dist-info}/WHEEL +0 -0
- {deepeval-3.7.3.dist-info → deepeval-3.7.4.dist-info}/entry_points.txt +0 -0
deepeval/optimization/gepa/loop.py
@@ -0,0 +1,677 @@
from __future__ import annotations

import uuid
import random
import time

from typing import (
    Awaitable,
    Callable,
    Dict,
    List,
    Tuple,
    TYPE_CHECKING,
    Union,
    Optional,
)

from deepeval.errors import DeepEvalError
from deepeval.optimization.aggregates import Aggregator, mean_of_all
from deepeval.optimization.types import (
    AcceptedIterationDict,
    PromptConfiguration,
    PromptConfigurationId,
    ModuleId,
    ScoreTable,
    ScoringAdapter,
    OptimizationResult,
    RunnerStatusType,
    RunnerStatusCallbackProtocol,
)
from deepeval.optimization.utils import (
    split_goldens,
    build_prompt_config_snapshots,
)
from deepeval.optimization.policies import (
    pick_best_with_ties,
    select_prompt_configuration_pareto,
)
from deepeval.prompt.api import PromptType
from deepeval.prompt.prompt import Prompt
from deepeval.optimization.mutations.prompt_rewriter import (
    PromptRewriter,
)
from .configs import GEPAConfig


if TYPE_CHECKING:
    from deepeval.dataset.golden import Golden, ConversationalGolden


class GEPARunner:
    """
    GEPA loop with sync/async execution.

    This runner is intentionally low level and does not know about metrics,
    models, or async configs. It relies on a preconfigured
    ScoringAdapter and PromptRewriter, which are typically constructed by
    the higher-level PromptOptimizer.
    """

    SINGLE_MODULE_ID: ModuleId = "__module__"

    def __init__(
        self,
        *,
        config: GEPAConfig,
        aggregate_instances: Aggregator = mean_of_all,
        scoring_adapter: Optional[ScoringAdapter] = None,
    ) -> None:
        self.config = config
        self.aggregate_instances = aggregate_instances
        self.scoring_adapter = scoring_adapter

        # random seeded from config is used for splits, sampling, and tie-breaking.
        self.random_state = random.Random(config.random_seed)

        # runtime state to be reset between runs
        self.reset_state()

        # Status callback set by PromptOptimizer:
        # (kind, step_index, total_steps, detail) -> None
        self.status_callback: Optional[RunnerStatusCallbackProtocol] = None

        # Model callback used by the rewriter, set by PromptOptimizer.
        self.model_callback: Optional[
            Callable[
                ...,
                Union[
                    str,
                    Dict,
                    Tuple[Union[str, Dict], float],
                ],
            ]
        ] = None

        # lazy loaded
        self._rewriter: Optional[PromptRewriter] = None

    ##############
    # Public API #
    ##############

    def execute(
        self,
        *,
        prompt: Prompt,
        goldens: Union[List["Golden"], List["ConversationalGolden"]],
    ) -> Tuple[Prompt, Dict]:
        """Synchronous GEPA run from a full list of goldens (splits internally)."""
        total_goldens = len(goldens)
        if total_goldens < 2:
            raise DeepEvalError(
                "GEPA prompt optimization requires at least 2 goldens, but "
                f"received {total_goldens}. Provide at least two goldens to "
                "run the optimizer."
            )

        self._ensure_scoring_adapter()
        self._ensure_rewriter()
        self.reset_state()

        d_feedback, d_pareto = split_goldens(
            goldens, self.config.pareto_size, random_state=self.random_state
        )

        seed_prompts_by_module = {self.SINGLE_MODULE_ID: prompt}
        root_prompt_configuration = PromptConfiguration.new(
            prompts=dict(seed_prompts_by_module)
        )
        self._add_prompt_configuration(root_prompt_configuration)

        accepted_iterations: List[Dict] = []

        def _one_iteration() -> bool:
            nonlocal accepted_iterations

            if not d_feedback:
                return False

            # Seed Pareto scores lazily on first iteration
            if not self.pareto_score_table:
                self.pareto_score_table[root_prompt_configuration.id] = (
                    self.scoring_adapter.score_on_pareto(
                        root_prompt_configuration, d_pareto
                    )
                )

            # 1. Pick prompt_configuration via Pareto
            parent_prompt_configuration = self._pick_prompt_configuration()

            # 2. Single module id
            selected_module_id: ModuleId = self.SINGLE_MODULE_ID

            # 3. Draw minibatch
            minibatch = self._draw_minibatch(d_feedback)

            # 4. Feedback
            feedback_text = self.scoring_adapter.minibatch_feedback(
                parent_prompt_configuration, selected_module_id, minibatch
            )

            # 5. Rewrite
            child_prompt = self._generate_child_prompt(
                selected_module_id, parent_prompt_configuration, feedback_text
            )
            if child_prompt is None:
                # Child prompt matched parent; skip this iteration.
                return True

            # 6. Child prompt_configuration
            child_prompt_configuration = self._make_child(
                selected_module_id, parent_prompt_configuration, child_prompt
            )

            # 7. Evaluate parent/child on minibatch
            parent_score = self.scoring_adapter.minibatch_score(
                parent_prompt_configuration, minibatch
            )
            child_score = self.scoring_adapter.minibatch_score(
                child_prompt_configuration, minibatch
            )

            # 8. Acceptance test
            if self._should_accept_child(parent_score, child_score):
                accepted_iterations.append(
                    self._accept_child(
                        selected_module_id,
                        parent_prompt_configuration,
                        child_prompt_configuration,
                        d_pareto,
                        parent_score,
                        child_score,
                    )
                )

            return True

        self._run_loop_iteration(_one_iteration)
        best = self._best_by_aggregate()
        prompt_config_snapshots = build_prompt_config_snapshots(
            self.prompt_configurations_by_id
        )
        report = OptimizationResult(
            optimization_id=self.optimization_id,
            best_id=best.id,
            accepted_iterations=accepted_iterations,
            pareto_scores=self.pareto_score_table,
            parents=self.parents_by_id,
            prompt_configurations=prompt_config_snapshots,
        )
        return best.prompts[self.SINGLE_MODULE_ID], report.as_dict()

    async def a_execute(
        self,
        *,
        prompt: Prompt,
        goldens: Union[List["Golden"], List["ConversationalGolden"]],
    ) -> Tuple[Prompt, Dict]:
        """Asynchronous twin of execute()."""
        total_goldens = len(goldens)
        if total_goldens < 2:
            raise DeepEvalError(
                "GEPA prompt optimization requires at least 2 goldens, but "
                f"received {total_goldens}. Provide at least two goldens to "
                "run the optimizer."
            )

        self._ensure_scoring_adapter()
        self._ensure_rewriter()
        self.reset_state()

        d_feedback, d_pareto = split_goldens(
            goldens, self.config.pareto_size, random_state=self.random_state
        )

        seed_prompts_by_module = {self.SINGLE_MODULE_ID: prompt}
        root_prompt_configuration = PromptConfiguration.new(
            prompts=dict(seed_prompts_by_module)
        )
        self._add_prompt_configuration(root_prompt_configuration)

        accepted_iterations: List[Dict] = []

        async def _one_iteration() -> bool:
            nonlocal accepted_iterations

            if not d_feedback:
                return False

            # Seed Pareto scores lazily on first iteration
            if not self.pareto_score_table:
                self.pareto_score_table[root_prompt_configuration.id] = (
                    await self.scoring_adapter.a_score_on_pareto(
                        root_prompt_configuration, d_pareto
                    )
                )

            # 1. Pick prompt_configuration via Pareto
            parent_prompt_configuration = self._pick_prompt_configuration()

            # 2. Single module id
            selected_module_id: ModuleId = self.SINGLE_MODULE_ID

            # 3. Draw minibatch
            minibatch = self._draw_minibatch(d_feedback)

            # 4. Feedback
            feedback_text = await self.scoring_adapter.a_minibatch_feedback(
                parent_prompt_configuration, selected_module_id, minibatch
            )

            # 5. Rewrite
            child_prompt = await self._a_generate_child_prompt(
                selected_module_id, parent_prompt_configuration, feedback_text
            )
            if child_prompt is None:
                # Child prompt matched parent; skip this iteration.
                return True

            # 6. Child prompt_configuration
            child_prompt_configuration = self._make_child(
                selected_module_id, parent_prompt_configuration, child_prompt
            )

            # 7. Evaluate parent/child on minibatch
            parent_score = await self.scoring_adapter.a_minibatch_score(
                parent_prompt_configuration, minibatch
            )
            child_score = await self.scoring_adapter.a_minibatch_score(
                child_prompt_configuration, minibatch
            )

            # 8. Acceptance test
            if self._should_accept_child(parent_score, child_score):
                accepted_iterations.append(
                    await self._a_accept_child(
                        selected_module_id,
                        parent_prompt_configuration,
                        child_prompt_configuration,
                        d_pareto,
                        parent_score,
                        child_score,
                    )
                )
            return True

        await self._a_run_loop_iteration(_one_iteration)
        best = self._best_by_aggregate()
        prompt_config_snapshots = build_prompt_config_snapshots(
            self.prompt_configurations_by_id
        )
        report = OptimizationResult(
            optimization_id=self.optimization_id,
            best_id=best.id,
            accepted_iterations=accepted_iterations,
            pareto_scores=self.pareto_score_table,
            parents=self.parents_by_id,
            prompt_configurations=prompt_config_snapshots,
        )
        return best.prompts[self.SINGLE_MODULE_ID], report.as_dict()

    ###################
    # State & helpers #
    ###################

    def reset_state(self) -> None:
        self.optimization_id = str(uuid.uuid4())
        self.prompt_configurations_by_id: Dict[
            PromptConfigurationId, PromptConfiguration
        ] = {}
        self.parents_by_id: Dict[
            PromptConfigurationId, Optional[PromptConfigurationId]
        ] = {}
        self.pareto_score_table: ScoreTable = {}

    def _ensure_scoring_adapter(self) -> None:
        if self.scoring_adapter is None:
            raise DeepEvalError(
                "GEPARunner requires a `scoring_adapter`. "
                "Construct one (for example, DeepEvalScoringAdapter) in "
                "PromptOptimizer and assign it to `runner.scoring_adapter`."
            )

    def _ensure_rewriter(self) -> None:
        if self._rewriter is not None:
            return

        # For now, always use the basic PromptRewriter. Additional
        # variants (e.g. for GEPA Alg. 4 crossover) can be introduced
        # later
        self._rewriter = PromptRewriter()

    def _prompts_equivalent(
        self, old_prompt: Prompt, new_prompt: Prompt
    ) -> bool:
        """
        Compare two Prompts for GEPA acceptance purposes.

        This is used as:
            if self._prompts_equivalent(old, new):
                # reject child (treat as "no change")
                return None

        So:
        - Return True: "do not accept this child"
        - Return False: "child is meaningfully different"

        Rules:
        - The types must be the same for this check to be meaningful
        - For TEXT: compare text_template with whitespace trimmed
        - For LIST: compare messages_template (length, role, and content,
          with content whitespace trimmed).
        """

        # LIST prompts: compare messages
        if new_prompt.type == PromptType.LIST:
            old_msgs = old_prompt.messages_template
            new_msgs = new_prompt.messages_template
            if len(old_msgs) != len(new_msgs):
                return False

            for old_msg, new_msg in zip(old_msgs, new_msgs):
                if old_msg.role != new_msg.role:
                    return False
                if (old_msg.content or "").strip() != (
                    new_msg.content or ""
                ).strip():
                    return False

            return True

        # TEXT prompts: compare text_template
        old_txt = (old_prompt.text_template or "").strip()
        new_txt = (new_prompt.text_template or "").strip()
        return new_txt == old_txt

    def _add_prompt_configuration(
        self, prompt_configuration: PromptConfiguration
    ) -> None:
        self.prompt_configurations_by_id[prompt_configuration.id] = (
            prompt_configuration
        )
        self.parents_by_id[prompt_configuration.id] = (
            prompt_configuration.parent
        )

    def _best_by_aggregate(self) -> PromptConfiguration:
        totals = {
            prompt_configuration_id: self.aggregate_instances(vector)
            for prompt_configuration_id, vector in self.pareto_score_table.items()
        }

        chosen, tied, max_val = pick_best_with_ties(
            totals,
            self.parents_by_id,
            random_state=self.random_state,
            tie_tolerance=float(self.config.tie_tolerance),
            policy=self.config.tie_breaker,
        )
        if self.status_callback is not None and len(tied) > 1:
            msg = (
                f"tie on aggregate={max_val:.4f} among {len(tied)} "
                f"prompt_configurations; using tie_breaker="
                f"{self.config.tie_breaker.value!r} selected {chosen}. "
                f"To change, set GEPAConfig.tie_breaker to one of: "
                f"{[t.value for t in self.config.TieBreaker]} "
                f"(tie_tolerance={float(self.config.tie_tolerance):g})."
            )
            self.status_callback(
                RunnerStatusType.TIE,
                detail=msg,
            )

        return self.prompt_configurations_by_id[chosen]

    def _pick_prompt_configuration(self) -> PromptConfiguration:
        selected_prompt_configuration_id = select_prompt_configuration_pareto(
            self.pareto_score_table, random_state=self.random_state
        )
        return self.prompt_configurations_by_id[
            selected_prompt_configuration_id
        ]

    def _draw_minibatch(
        self, d_feedback: Union[List["Golden"], List["ConversationalGolden"]]
    ) -> Union[List["Golden"], List["ConversationalGolden"]]:
        # Determine effective minibatch size from GEPAConfig, bounded by the
        # available feedback set.
        n_feedback = len(d_feedback)
        if n_feedback <= 0:
            return []

        if self.config.minibatch_size is not None:
            size = self.config.minibatch_size
        else:
            # Dynamic sizing from ratio, bounded between min and max.
            dynamic = max(
                1, int(round(n_feedback * self.config.minibatch_ratio))
            )
            size = max(
                self.config.minibatch_min_size,
                min(dynamic, self.config.minibatch_max_size),
            )

        size = max(1, min(size, n_feedback))

        return [
            d_feedback[self.random_state.randrange(0, n_feedback)]
            for _ in range(size)
        ]

    async def _a_generate_child_prompt(
        self,
        selected_module_id: ModuleId,
        parent_prompt_configuration: PromptConfiguration,
        feedback_text: str,
    ) -> Optional[Prompt]:
        old_prompt = parent_prompt_configuration.prompts.get(
            selected_module_id, Prompt(text_template="")
        )

        new_prompt = await self._rewriter.a_rewrite(
            model_callback=self.model_callback,
            module_id=selected_module_id,
            old_prompt=old_prompt,
            feedback_text=feedback_text,
        )

        if old_prompt.type != new_prompt.type or self._prompts_equivalent(
            old_prompt, new_prompt
        ):
            # don't accept if new prompt is the same as parent
            # or if the type somehow changed
            return None
        return new_prompt

    def _generate_child_prompt(
        self,
        selected_module_id: ModuleId,
        parent_prompt_configuration: PromptConfiguration,
        feedback_text: str,
    ) -> Optional[Prompt]:
        old_prompt = parent_prompt_configuration.prompts.get(
            selected_module_id, Prompt(text_template="")
        )

        new_prompt = self._rewriter.rewrite(
            model_callback=self.model_callback,
            module_id=selected_module_id,
            old_prompt=old_prompt,
            feedback_text=feedback_text,
        )

        if old_prompt.type != new_prompt.type or self._prompts_equivalent(
            old_prompt, new_prompt
        ):
            # don't accept if new prompt is the same as parent
            # or if the type somehow changed
            return None
        return new_prompt

    def _make_child(
        self,
        selected_module_id: ModuleId,
        parent_prompt_configuration: PromptConfiguration,
        child_prompt: Prompt,
    ) -> PromptConfiguration:
        child_prompt_configuration = PromptConfiguration.new(
            prompts=dict(parent_prompt_configuration.prompts),
            parent=parent_prompt_configuration.id,
        )
        child_prompt_configuration.prompts[selected_module_id] = child_prompt
        return child_prompt_configuration

    def _should_accept_child(
        self, parent_score: float, child_score: float
    ) -> bool:
        jitter = 1e-6
        return child_score >= parent_score + max(self.config.min_delta, jitter)

    def _accept_child(
        self,
        selected_module_id: ModuleId,
        parent_prompt_configuration: PromptConfiguration,
        child_prompt_configuration: PromptConfiguration,
        d_pareto: Union[List["Golden"], List["ConversationalGolden"]],
        parent_score: float,
        child_score: float,
    ) -> AcceptedIterationDict:
        self._add_prompt_configuration(child_prompt_configuration)
        self.pareto_score_table[child_prompt_configuration.id] = (
            self.scoring_adapter.score_on_pareto(
                child_prompt_configuration, d_pareto
            )
        )

        return AcceptedIterationDict(
            parent=parent_prompt_configuration.id,
            child=child_prompt_configuration.id,
            module=selected_module_id,
            before=parent_score,
            after=child_score,
        )

    async def _a_accept_child(
        self,
        selected_module_id: ModuleId,
        parent_prompt_configuration: PromptConfiguration,
        child_prompt_configuration: PromptConfiguration,
        d_pareto: Union[List["Golden"], List["ConversationalGolden"]],
        parent_score: float,
        child_score: float,
    ) -> AcceptedIterationDict:
        self._add_prompt_configuration(child_prompt_configuration)
        self.pareto_score_table[child_prompt_configuration.id] = (
            await self.scoring_adapter.a_score_on_pareto(
                child_prompt_configuration, d_pareto
            )
        )

        return AcceptedIterationDict(
            parent=parent_prompt_configuration.id,
            child=child_prompt_configuration.id,
            module=selected_module_id,
            before=parent_score,
            after=child_score,
        )

    def _update_progress(
        self,
        total_iterations: int,
        iteration: int,
        remaining_iterations: int,
        elapsed: float,
    ):
        if self.status_callback is not None:
            detail = (
                f"(iterations={total_iterations}) "
                f"• iteration {iteration}/{total_iterations} "
                f"• {elapsed:.2f}s • remaining={remaining_iterations}"
            )
            self.status_callback(
                RunnerStatusType.PROGRESS,
                step_index=iteration,
                total_steps=total_iterations,
                detail=detail,
            )

    def _update_error(
        self, total_iterations: int, iteration: int, exc: Exception
    ):
        # Report a user facing error event
        if self.status_callback is not None:
            detail = (
                f"(iterations={total_iterations}) "
                f"• error {exc.__class__.__name__}: {exc} "
                f"• halted at iteration {iteration}"
            )
            self.status_callback(
                RunnerStatusType.ERROR,
                step_index=iteration,
                total_steps=total_iterations,
                detail=detail,
            )

    def _run_loop_iteration(
        self,
        gepa_iteration: Callable[[], bool],
    ) -> None:
        total_iterations = self.config.iterations
        remaining_iterations = total_iterations
        iteration = 0
        self._update_progress(
            total_iterations, iteration, remaining_iterations, 0
        )
        while remaining_iterations > 0:
            iteration += 1
            start_time = time.perf_counter()
            try:
                ok = gepa_iteration()
            except Exception as exc:
                # Report a user facing error event and halt optimization.
                self._update_error(total_iterations, iteration, exc)
                break
            elapsed = time.perf_counter() - start_time
            if not ok:
                break
            remaining_iterations -= 1
            self._update_progress(
                total_iterations, iteration, remaining_iterations, elapsed
            )

    async def _a_run_loop_iteration(
        self,
        a_gepa_iteration: Callable[[], Awaitable[bool]],
    ) -> None:
        total_iterations = self.config.iterations
        remaining_iterations = total_iterations
        iteration = 0
        self._update_progress(
            total_iterations, iteration, remaining_iterations, 0
        )
        while remaining_iterations > 0:
            iteration += 1
            start_time = time.perf_counter()
            try:
                ok = await a_gepa_iteration()
            except Exception as exc:
                # Report a user facing error event and halt optimization.
                self._update_error(total_iterations, iteration, exc)
                break
            elapsed = time.perf_counter() - start_time
            if not ok:
                break
            remaining_iterations -= 1
            self._update_progress(
                total_iterations, iteration, remaining_iterations, elapsed
            )
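
For orientation, the sketch below shows one way this runner's public surface could be driven directly. It is illustrative only and not part of the diff: the scoring-adapter methods mirror exactly what the loop above calls (score_on_pareto, minibatch_feedback, minibatch_score, and their a_-prefixed async twins), while the GEPAConfig keyword arguments, the Golden constructor, the shape of the Pareto score vector, and the model_callback signature are assumptions about APIs not shown in this hunk. In the released package these pieces are normally wired up by the higher-level PromptOptimizer.

# Illustrative usage sketch -- not part of the deepeval 3.7.4 wheel.
# The adapter below implements only the surface GEPARunner relies on;
# GEPAConfig kwargs, Golden(...), the score-vector shape, and the
# model_callback signature are assumptions, not confirmed by this diff.
from typing import List

from deepeval.dataset.golden import Golden
from deepeval.optimization.gepa.configs import GEPAConfig
from deepeval.optimization.gepa.loop import GEPARunner
from deepeval.prompt.prompt import Prompt


class ConstantScoringAdapter:
    """Toy adapter returning fixed scores, for wiring demonstration only."""

    def score_on_pareto(self, prompt_configuration, d_pareto) -> List[float]:
        # Assumed shape: one score per Pareto golden.
        return [0.5 for _ in d_pareto]

    async def a_score_on_pareto(self, prompt_configuration, d_pareto) -> List[float]:
        return self.score_on_pareto(prompt_configuration, d_pareto)

    def minibatch_feedback(self, prompt_configuration, module_id, minibatch) -> str:
        return "Be more explicit about the expected output format."

    async def a_minibatch_feedback(self, prompt_configuration, module_id, minibatch) -> str:
        return self.minibatch_feedback(prompt_configuration, module_id, minibatch)

    def minibatch_score(self, prompt_configuration, minibatch) -> float:
        return 0.5

    async def a_minibatch_score(self, prompt_configuration, minibatch) -> float:
        return self.minibatch_score(prompt_configuration, minibatch)


config = GEPAConfig(iterations=5)  # assumed keyword argument; other fields left at defaults
runner = GEPARunner(config=config, scoring_adapter=ConstantScoringAdapter())
runner.model_callback = lambda *args, **kwargs: "rewritten prompt text"  # stand-in for the rewriter's LLM call

best_prompt, report = runner.execute(
    prompt=Prompt(text_template="Answer the question: {input}"),
    goldens=[Golden(input=f"question {i}") for i in range(4)],  # assumed constructor
)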