opik-optimizer 0.9.1__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opik_optimizer/__init__.py +7 -3
- opik_optimizer/_throttle.py +8 -8
- opik_optimizer/base_optimizer.py +98 -45
- opik_optimizer/cache_config.py +5 -3
- opik_optimizer/datasets/ai2_arc.py +15 -13
- opik_optimizer/datasets/cnn_dailymail.py +19 -15
- opik_optimizer/datasets/election_questions.py +10 -11
- opik_optimizer/datasets/gsm8k.py +16 -11
- opik_optimizer/datasets/halu_eval.py +6 -5
- opik_optimizer/datasets/hotpot_qa.py +17 -16
- opik_optimizer/datasets/medhallu.py +10 -7
- opik_optimizer/datasets/rag_hallucinations.py +11 -8
- opik_optimizer/datasets/ragbench.py +17 -9
- opik_optimizer/datasets/tiny_test.py +33 -37
- opik_optimizer/datasets/truthful_qa.py +18 -12
- opik_optimizer/demo/cache.py +6 -6
- opik_optimizer/demo/datasets.py +3 -7
- opik_optimizer/evolutionary_optimizer/__init__.py +3 -1
- opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +748 -437
- opik_optimizer/evolutionary_optimizer/reporting.py +155 -76
- opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +291 -181
- opik_optimizer/few_shot_bayesian_optimizer/reporting.py +79 -28
- opik_optimizer/logging_config.py +19 -15
- opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +234 -138
- opik_optimizer/meta_prompt_optimizer/reporting.py +121 -47
- opik_optimizer/mipro_optimizer/__init__.py +2 -0
- opik_optimizer/mipro_optimizer/_lm.py +41 -9
- opik_optimizer/mipro_optimizer/_mipro_optimizer_v2.py +37 -26
- opik_optimizer/mipro_optimizer/mipro_optimizer.py +135 -67
- opik_optimizer/mipro_optimizer/utils.py +5 -2
- opik_optimizer/optimizable_agent.py +179 -0
- opik_optimizer/optimization_config/chat_prompt.py +143 -73
- opik_optimizer/optimization_config/configs.py +4 -3
- opik_optimizer/optimization_config/mappers.py +18 -6
- opik_optimizer/optimization_result.py +28 -20
- opik_optimizer/py.typed +0 -0
- opik_optimizer/reporting_utils.py +96 -46
- opik_optimizer/task_evaluator.py +12 -14
- opik_optimizer/utils.py +122 -37
- {opik_optimizer-0.9.1.dist-info → opik_optimizer-1.0.0.dist-info}/METADATA +8 -8
- opik_optimizer-1.0.0.dist-info/RECORD +50 -0
- opik_optimizer-0.9.1.dist-info/RECORD +0 -48
- {opik_optimizer-0.9.1.dist-info → opik_optimizer-1.0.0.dist-info}/WHEEL +0 -0
- {opik_optimizer-0.9.1.dist-info → opik_optimizer-1.0.0.dist-info}/licenses/LICENSE +0 -0
- {opik_optimizer-0.9.1.dist-info → opik_optimizer-1.0.0.dist-info}/top_level.txt +0 -0
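The largest change in this release is the rework of the optimizer entry points visible in the `evolutionary_optimizer.py` diff below: `EvolutionaryOptimizer.__init__` drops the `project_name` argument, and `optimize_prompt` now validates its inputs (a `ChatPrompt`, an `opik.Dataset`, and a metric callable taking `dataset_item` and `llm_output`) and accepts an optional `agent_class` hook backed by the new `optimizable_agent.py` module. The sketch below illustrates the 1.0.0 calling convention as inferred from this diff; the import paths, dataset lookup, and metric body are assumptions for illustration, not verified documentation.

```python
# Hypothetical usage sketch based on the 1.0.0 signatures visible in this diff.
# Import paths, the dataset lookup, and the metric body are illustrative assumptions.
import opik
from opik_optimizer.optimization_config.chat_prompt import ChatPrompt
from opik_optimizer.evolutionary_optimizer.evolutionary_optimizer import EvolutionaryOptimizer


def exact_match(dataset_item, llm_output):
    # optimize_prompt() requires a callable taking `dataset_item` and `llm_output`.
    return float(dataset_item["answer"].strip() == llm_output.strip())


dataset = opik.Opik().get_dataset(name="my-dataset")  # must be an opik.Dataset

prompt = ChatPrompt(
    messages=[
        {"role": "system", "content": "Answer concisely."},
        {"role": "user", "content": "{question}"},
    ]
)

# project_name is no longer an __init__ argument in 1.0.0; when agent_class is
# omitted, a LiteLLM-backed agent class is created from the prompt internally.
optimizer = EvolutionaryOptimizer(
    model="gpt-4o-mini",
    population_size=10,
    num_generations=5,
)

result = optimizer.optimize_prompt(
    prompt=prompt,
    dataset=dataset,
    metric=exact_match,
    n_samples=50,
)
print(result)
```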
--- a/opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py
+++ b/opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py
@@ -2,9 +2,9 @@ import json
 import logging
 import os
 import random
-from typing import Any, Callable, Dict, List,
+from typing import Any, Callable, Dict, List, Optional, Set, Tuple, cast, Type

-import
+import rapidfuzz.distance.Indel
 import litellm
 import numpy as np
 import opik
@@ -23,6 +23,7 @@ from opik_optimizer import _throttle, task_evaluator
 from opik_optimizer.base_optimizer import BaseOptimizer, OptimizationRound
 from opik_optimizer.optimization_config import chat_prompt, mappers
 from opik_optimizer.optimization_result import OptimizationResult
+from opik_optimizer.optimizable_agent import OptimizableAgent

 from .. import utils
 from . import reporting
@@ -37,6 +38,7 @@ litellm.cache = Cache(type=LiteLLMCacheType.DISK, disk_cache_dir=disk_cache_dir)

 creator = cast(Any, _creator) # type: ignore[assignment]

+
 class EvolutionaryOptimizer(BaseOptimizer):
     """
     The Evolutionary Optimizer can be used to optimize prompts using a 4 stage genetic algorithm
@@ -47,10 +49,10 @@ class EvolutionaryOptimizer(BaseOptimizer):
     2. Evaluate the candidate prompts
     3. Select the best prompts
     4. Repeat until convergence
-
+
     This algorithm is best used if you have a first draft prompt and would like to find a better
     prompt.
-
+
     Note: This algorithm is time consuming and can be expensive to run.
     """

@@ -65,15 +67,17 @@ class EvolutionaryOptimizer(BaseOptimizer):
     DEFAULT_MIN_MUTATION_RATE = 0.1
     DEFAULT_MAX_MUTATION_RATE = 0.4
     DEFAULT_ADAPTIVE_MUTATION = True
-    DEFAULT_DIVERSITY_THRESHOLD = 0.7
-    DEFAULT_RESTART_THRESHOLD = 0.01
-    DEFAULT_RESTART_GENERATIONS = 3
-    DEFAULT_CACHE_SIZE = 1000
+    DEFAULT_DIVERSITY_THRESHOLD = 0.7
+    DEFAULT_RESTART_THRESHOLD = 0.01
+    DEFAULT_RESTART_GENERATIONS = 3
+    DEFAULT_CACHE_SIZE = 1000
     DEFAULT_EARLY_STOPPING_GENERATIONS = 5
-    DEFAULT_ENABLE_MOO = True
-    DEFAULT_ENABLE_LLM_CROSSOVER = True
+    DEFAULT_ENABLE_MOO = True
+    DEFAULT_ENABLE_LLM_CROSSOVER = True
     DEFAULT_SEED = 42
-    DEFAULT_OUTPUT_STYLE_GUIDANCE =
+    DEFAULT_OUTPUT_STYLE_GUIDANCE = (
+        "Produce clear, effective, and high-quality responses suitable for the task."
+    )
     DEFAULT_MOO_WEIGHTS = (1.0, -1.0) # (Maximize Score, Minimize Length)

     _INFER_STYLE_SYSTEM_PROMPT = """You are an expert in linguistic analysis and prompt engineering. Your task is to analyze a few input-output examples from a dataset and provide a concise, actionable description of the desired output style. This description will be used to guide other LLMs in generating and refining prompts.
@@ -85,7 +89,7 @@ Focus on characteristics like:
 - **Content Details**: (e.g., includes only the answer, includes reasoning, provides examples, avoids pleasantries)
 - **Keywords/Phrasing**: Any recurring keywords or phrasing patterns in the outputs.

-Provide a single string that summarizes this style. This summary should be directly usable as an instruction for another LLM.
+Provide a single string that summarizes this style. This summary should be directly usable as an instruction for another LLM.
 For example: 'Outputs should be a single, concise proper noun.' OR 'Outputs should be a short paragraph explaining the reasoning, followed by a direct answer, avoiding conversational pleasantries.' OR 'Outputs are typically 1-2 sentences, providing a direct factual answer.'
 Return ONLY this descriptive string, with no preamble or extra formatting.
 """
@@ -93,7 +97,6 @@ Return ONLY this descriptive string, with no preamble or extra formatting.
     def __init__(
         self,
         model: str,
-        project_name: str = "Optimization",
         population_size: int = DEFAULT_POPULATION_SIZE,
         num_generations: int = DEFAULT_NUM_GENERATIONS,
         mutation_rate: float = DEFAULT_MUTATION_RATE,
@@ -108,12 +111,11 @@ Return ONLY this descriptive string, with no preamble or extra formatting.
         output_style_guidance: Optional[str] = None,
         infer_output_style: bool = False,
         verbose: int = 1,
-        **model_kwargs,
-    ):
+        **model_kwargs: Any,
+    ) -> None:
         """
         Args:
             model: The model to use for evaluation
-            project_name: Optional project name for tracking
             population_size: Number of prompts in the population
             num_generations: Number of generations to run
             mutation_rate: Mutation rate for genetic operations
@@ -131,7 +133,7 @@ Return ONLY this descriptive string, with no preamble or extra formatting.
             **model_kwargs: Additional model parameters
         """
         # Initialize base class first
-        super().__init__(model=model,
+        super().__init__(model=model, verbose=verbose, **model_kwargs)
         self.population_size = population_size
         self.num_generations = num_generations
         self.mutation_rate = mutation_rate
@@ -143,20 +145,22 @@ Return ONLY this descriptive string, with no preamble or extra formatting.
         self.enable_moo = enable_moo
         self.enable_llm_crossover = enable_llm_crossover
         self.seed = seed
-        self.output_style_guidance =
+        self.output_style_guidance = (
+            output_style_guidance
+            if output_style_guidance is not None
+            else self.DEFAULT_OUTPUT_STYLE_GUIDANCE
+        )
         self.infer_output_style = infer_output_style
         self.llm_call_counter = 0
         self._opik_client = opik_client.get_client_cached()
-        self._current_optimization_id = None
+        self._current_optimization_id: Optional[str] = None
         self._current_generation = 0
-        self._best_fitness_history = []
+        self._best_fitness_history: List[float] = []
         self._generations_without_improvement = 0
-        self.
-        self._current_population = []
+        self._current_population: List[Any] = []
         self._generations_without_overall_improvement = 0
-        self._best_primary_score_history:
+        self._best_primary_score_history: List[float] = []
         self._gens_since_pop_improvement: int = 0
-        self.verbose = verbose

         if self.seed is not None:
             random.seed(self.seed)
@@ -167,33 +171,44 @@ Return ONLY this descriptive string, with no preamble or extra formatting.

         if self.enable_moo:
             if not hasattr(creator, "FitnessMulti"):
-                creator.create(
+                creator.create(
+                    "FitnessMulti", base.Fitness, weights=self.DEFAULT_MOO_WEIGHTS
+                )
             fitness_attr = creator.FitnessMulti
         else:
             if not hasattr(creator, "FitnessMax"):
                 creator.create("FitnessMax", base.Fitness, weights=(1.0,))
             fitness_attr = creator.FitnessMax
-
-        if
+
+        if (
+            not hasattr(creator, "Individual")
+            or getattr(creator.Individual, "fitness") != fitness_attr
+        ):
             if hasattr(creator, "Individual"):
                 del creator.Individual
             creator.create("Individual", list, fitness=fitness_attr)

         self.toolbox = base.Toolbox()
-        self.toolbox.register(
-
-
+        self.toolbox.register(
+            "default_individual", lambda: creator.Individual("placeholder")
+        )
+        self.toolbox.register(
+            "population", tools.initRepeat, list, self.toolbox.default_individual
+        )
+
         if self.enable_llm_crossover:
             self.toolbox.register("mate", self._llm_deap_crossover)
         else:
             self.toolbox.register("mate", self._deap_crossover)
-
+
         self.toolbox.register("mutate", self._deap_mutation)
-
+
         if self.enable_moo:
             self.toolbox.register("select", tools.selNSGA2)
         else:
-            self.toolbox.register(
+            self.toolbox.register(
+                "select", tools.selTournament, tournsize=self.tournament_size
+            )

         logger.debug(
             f"Initialized EvolutionaryOptimizer with model: {model}, MOO_enabled: {self.enable_moo}, "
@@ -209,22 +224,27 @@ Return ONLY this descriptive string, with no preamble or extra formatting.
             return self.mutation_rate

         # Calculate improvement rate
-        recent_improvement = (
-
+        recent_improvement = (
+            self._best_fitness_history[-1] - self._best_fitness_history[-2]
+        ) / abs(self._best_fitness_history[-2])
+
         # Calculate population diversity
         current_diversity = self._calculate_population_diversity()
-
+
         # Check for stagnation
         if recent_improvement < self.DEFAULT_RESTART_THRESHOLD:
             self._generations_without_improvement += 1
         else:
             self._generations_without_improvement = 0
-
+
         # Adjust mutation rate based on both improvement and diversity
         if self._generations_without_improvement >= self.DEFAULT_RESTART_GENERATIONS:
             # Significant stagnation - increase mutation significantly
             return min(self.mutation_rate * 2.5, self.DEFAULT_MAX_MUTATION_RATE)
-        elif
+        elif (
+            recent_improvement < 0.01
+            and current_diversity < self.DEFAULT_DIVERSITY_THRESHOLD
+        ):
             # Both stagnating and low diversity - increase mutation significantly
             return min(self.mutation_rate * 2.0, self.DEFAULT_MAX_MUTATION_RATE)
         elif recent_improvement < 0.01:
@@ -237,29 +257,34 @@ Return ONLY this descriptive string, with no preamble or extra formatting.

     def _calculate_population_diversity(self) -> float:
         """Calculate the diversity of the current population."""
-        if not hasattr(self,
+        if not hasattr(self, "_current_population") or not self._current_population:
             return 0.0
-
-        # Calculate average Levenshtein distance between all pairs
+
+        # Calculate average Levenshtein using rapidfuzz distance between all pairs
         total_distance = 0.0
         count = 0
         for i in range(len(self._current_population)):
             for j in range(i + 1, len(self._current_population)):
                 str1 = str(self._current_population[i])
                 str2 = str(self._current_population[j])
-                distance =
+                distance = rapidfuzz.distance.Indel.normalized_similarity(str1, str2)
                 max_len = max(len(str1), len(str2))
                 if max_len > 0:
                     normalized_distance = distance / max_len
                     total_distance += normalized_distance
                     count += 1
-
-        return total_distance / count if count > 0 else 0.0

+        return total_distance / count if count > 0 else 0.0

-    def _deap_crossover_chunking_strategy(
-
-
+    def _deap_crossover_chunking_strategy(
+        self, messages_1_str: str, messages_2_str: str
+    ) -> Tuple[str, str]:
+        chunks1 = [
+            chunk.strip() for chunk in messages_1_str.split(".") if chunk.strip()
+        ]
+        chunks2 = [
+            chunk.strip() for chunk in messages_2_str.split(".") if chunk.strip()
+        ]

         # Try chunk-level crossover if both parents have at least 2 chunks
         if len(chunks1) >= 2 and len(chunks2) >= 2:
@@ -267,18 +292,22 @@ Return ONLY this descriptive string, with no preamble or extra formatting.
             # Crossover point is between 1 and min_num_chunks - 1
             # This requires min_num_chunks >= 2, which is already checked.
             point = random.randint(1, min_num_chunks - 1)
-
+
             child1_chunks = chunks1[:point] + chunks2[point:]
             child2_chunks = chunks2[:point] + chunks1[point:]
-
-            child1_str =
-            child2_str =
-
+
+            child1_str = ". ".join(child1_chunks) + ("." if child1_chunks else "")
+            child2_str = ". ".join(child2_chunks) + ("." if child2_chunks else "")
+
             return child1_str, child2_str
         else:
-            raise ValueError(
-
-
+            raise ValueError(
+                "Not enough chunks in either prompt for chunk-level crossover"
+            )
+
+    def _deap_crossover_word_level(
+        self, messages_1_str: str, messages_2_str: str
+    ) -> Tuple[str, str]:
         words1 = messages_1_str.split()
         words2 = messages_2_str.split()

@@ -295,53 +324,54 @@ Return ONLY this descriptive string, with no preamble or extra formatting.
         point = random.randint(1, min_word_len - 1)
         child1_words = words1[:point] + words2[point:]
         child2_words = words2[:point] + words1[point:]
-
-        return
-
-    def _deap_crossover(
-        self,
-        ind1: "creator.Individual",
-        ind2: "creator.Individual"
-    ) -> Tuple["creator.Individual", "creator.Individual"]:
+
+        return " ".join(child1_words), " ".join(child2_words)
+
+    def _deap_crossover(self, ind1: Any, ind2: Any) -> Tuple[Any, Any]:
         """Enhanced crossover operation that preserves semantic meaning.
         Attempts chunk-level crossover first, then falls back to word-level.
         """
-        reporting.display_message(
-
-
+        reporting.display_message(
+            " Recombining prompts by mixing and matching words and sentences.",
+            verbose=self.verbose,
+        )
+        messages_1_orig: List[Dict[str, str]] = ind1
+        messages_2_orig: List[Dict[str, str]] = ind2

         for i, message_1 in enumerate(messages_1_orig):
-            role: str = message_1[
-            message_1_str: str = message_1[
+            role: str = message_1["role"]
+            message_1_str: str = message_1["content"]

             # We check that the second message has enough AI messages and the correct role
-            if (len(messages_2_orig) >= i + 1) and (messages_2_orig[i][
+            if (len(messages_2_orig) >= i + 1) and (messages_2_orig[i]["role"] == role):
                 message_2 = messages_2_orig[i]
-                message_2_str: str = message_2[
+                message_2_str: str = message_2["content"]

                 try:
-                    child1_str, child2_str = self._deap_crossover_chunking_strategy(
+                    child1_str, child2_str = self._deap_crossover_chunking_strategy(
+                        message_1_str, message_2_str
+                    )
                 except ValueError:
-                    child1_str, child2_str = self._deap_crossover_word_level(
-
+                    child1_str, child2_str = self._deap_crossover_word_level(
+                        message_1_str, message_2_str
+                    )
+
                 # Update the message content
-                messages_1_orig[i][
-                messages_2_orig[i][
+                messages_1_orig[i]["content"] = child1_str
+                messages_2_orig[i]["content"] = child2_str
             else:
                 # We don't perform any crossover if there are not enough messages or the roles
                 # don't match
                 pass
-
+
         return creator.Individual(messages_1_orig), creator.Individual(messages_2_orig)

     def _deap_mutation(
-
-
-        initial_prompt: chat_prompt.ChatPrompt
-    ) -> "creator.Individual":
+        self, individual: Any, initial_prompt: chat_prompt.ChatPrompt
+    ) -> Any:
         """Enhanced mutation operation with multiple strategies."""
         prompt = chat_prompt.ChatPrompt(messages=individual)
-
+
         # Choose mutation strategy based on current diversity
         diversity = self._calculate_population_diversity()

@@ -349,54 +379,68 @@ Return ONLY this descriptive string, with no preamble or extra formatting.
         if diversity < self.DEFAULT_DIVERSITY_THRESHOLD:
             # Low diversity - use more aggressive mutations (higher chance for semantic)
             semantic_threshold = 0.5
-            structural_threshold = 0.8
+            structural_threshold = 0.8 # semantic_threshold + 0.3
         else:
             # Good diversity - use more conservative mutations (higher chance for word_level)
             semantic_threshold = 0.4
-            structural_threshold = 0.7
+            structural_threshold = 0.7 # semantic_threshold + 0.3

         mutation_choice = random.random()

         if mutation_choice > structural_threshold:
             # This corresponds to the original 'else' (word_level_mutation)
             mutated_prompt = self._word_level_mutation_prompt(prompt)
-            reporting.display_success(
-
+            reporting.display_success(
+                " Mutation successful, prompt has been edited by randomizing words (word-level mutation).",
+                verbose=self.verbose,
+            )
+            return creator.Individual(mutated_prompt.get_messages())
         elif mutation_choice > semantic_threshold:
             # This corresponds to the original 'elif' (structural_mutation)
             mutated_prompt = self._structural_mutation(prompt)
-            reporting.display_success(
-
+            reporting.display_success(
+                " Mutation successful, prompt has been edited by reordering, combining, or splitting sentences (structural mutation).",
+                verbose=self.verbose,
+            )
+            return creator.Individual(mutated_prompt.get_messages())
         else:
             # This corresponds to the original 'if' (semantic_mutation)
             mutated_prompt = self._semantic_mutation(prompt, initial_prompt)
-            reporting.display_success(
-
+            reporting.display_success(
+                " Mutation successful, prompt has been edited using an LLM (semantic mutation).",
+                verbose=self.verbose,
+            )
+            return creator.Individual(mutated_prompt.get_messages())

     def _semantic_mutation(
-
-
-        initial_prompt: chat_prompt.ChatPrompt
-    ) -> chat_prompt.ChatPrompt:
+        self, prompt: chat_prompt.ChatPrompt, initial_prompt: chat_prompt.ChatPrompt
+    ) -> chat_prompt.ChatPrompt:
         """Enhanced semantic mutation with multiple strategies."""
         current_output_style_guidance = self.output_style_guidance
-        if random.random() < 0.1:
+        if random.random() < 0.1:
             return self._radical_innovation_mutation(prompt, initial_prompt)
-
+
         try:
-            strategy = random.choice(
-
-
-
+            strategy = random.choice(
+                [
+                    "rephrase",
+                    "simplify",
+                    "elaborate",
+                    "restructure",
+                    "focus",
+                    "increase_complexity_and_detail",
+                ]
+            )
+
             strategy_prompts = {
                 "rephrase": f"Create a different way to express the same instruction, possibly with a different length or structure, ensuring it still aims for an answer from the target LLM in the style of: '{current_output_style_guidance}'.",
                 "simplify": f"Simplify the instruction while maintaining its core meaning, potentially making it more concise, to elicit an answer in the style of: '{current_output_style_guidance}'.",
                 "elaborate": f"Add more relevant detail and specificity to the instruction, potentially increasing its length, but only if it helps achieve a more accurate answer from the target LLM in the style of: '{current_output_style_guidance}'.",
                 "restructure": f"Change the structure of the instruction (e.g., reorder sentences, combine/split ideas) while keeping its intent, ensuring the new structure strongly guides towards an output in the style of: '{current_output_style_guidance}'.",
                 "focus": f"Emphasize the key aspects of the instruction, perhaps by rephrasing or adding clarifying statements, to better elicit an answer in the style of: '{current_output_style_guidance}'.",
-                "increase_complexity_and_detail": f"Significantly elaborate on this instruction. Add more details, examples, context, or constraints to make it more comprehensive. The goal of this elaboration is to make the prompt itself more detailed, so that it VERY CLEARLY guides the target LLM to produce a highly accurate final answer in the style of: '{current_output_style_guidance}'. The prompt can be long if needed to achieve this output style."
+                "increase_complexity_and_detail": f"Significantly elaborate on this instruction. Add more details, examples, context, or constraints to make it more comprehensive. The goal of this elaboration is to make the prompt itself more detailed, so that it VERY CLEARLY guides the target LLM to produce a highly accurate final answer in the style of: '{current_output_style_guidance}'. The prompt can be long if needed to achieve this output style.",
             }
-
+
             user_prompt_for_semantic_mutation = f"""Given this prompt: '{prompt}'
 Task context: {self._get_task_description_for_llm(initial_prompt)}
 Desired output style from target LLM: '{current_output_style_guidance}'
@@ -405,46 +449,57 @@ Return only the modified prompt message list, nothing else. Make sure to return
 """
             response = self._call_model(
                 messages=[
-                    {
-
+                    {
+                        "role": "system",
+                        "content": f"You are a prompt engineering expert. Your goal is to modify prompts to improve their effectiveness in eliciting specific types of answers, particularly matching the style: '{current_output_style_guidance}'. Follow the specific modification instruction provided.",
+                    },
+                    {"role": "user", "content": user_prompt_for_semantic_mutation},
                 ],
-                is_reasoning=True
+                is_reasoning=True,
             )

             return chat_prompt.ChatPrompt(messages=utils.json_to_dict(response.strip()))
         except Exception as e:
-            reporting.display_error(
+            reporting.display_error(
+                f" Error in semantic mutation, this is usually a parsing error: {e}",
+                verbose=self.verbose,
+            )
             return prompt

     def _structural_mutation(
-
-
-    ) -> chat_prompt.ChatPrompt:
+        self, prompt: chat_prompt.ChatPrompt
+    ) -> chat_prompt.ChatPrompt:
         """Perform structural mutation (reordering, combining, splitting)."""
-        mutated_messages: List[Dict[
+        mutated_messages: List[Dict[str, str]] = []

-        for message in prompt.
+        for message in prompt.get_messages():
             content = message["content"]
             role = message["role"]

-            sentences = [s.strip() for s in content.split(
+            sentences = [s.strip() for s in content.split(".") if s.strip()]
             if len(sentences) <= 1:
-                mutated_messages.append(
+                mutated_messages.append(
+                    {"role": role, "content": self._word_level_mutation(content)}
+                )
                 continue
-
+
             mutation_type = random.random()
             if mutation_type < 0.3:
                 # Reorder sentences
                 random.shuffle(sentences)
-                mutated_messages.append(
+                mutated_messages.append(
+                    {"role": role, "content": ". ".join(sentences) + "."}
+                )
                 continue
             elif mutation_type < 0.6:
                 # Combine adjacent sentences
                 if len(sentences) >= 2:
                     idx = random.randint(0, len(sentences) - 2)
-                    combined = sentences[idx] +
-                    sentences[idx:idx+2] = [combined]
-                    mutated_messages.append(
+                    combined = sentences[idx] + " and " + sentences[idx + 1]
+                    sentences[idx : idx + 2] = [combined]
+                    mutated_messages.append(
+                        {"role": role, "content": ". ".join(sentences) + "."}
+                    )
                     continue
             else:
                 # Split a sentence
@@ -452,33 +507,45 @@ Return only the modified prompt message list, nothing else. Make sure to return
                 words = sentences[idx].split()
                 if len(words) > 3:
                     split_point = random.randint(2, len(words) - 2)
-                    sentences[idx:idx+1] = [
-
+                    sentences[idx : idx + 1] = [
+                        " ".join(words[:split_point]),
+                        " ".join(words[split_point:]),
+                    ]
+                    mutated_messages.append(
+                        {"role": role, "content": ". ".join(sentences) + "."}
+                    )
                     continue
                 else:
                     mutated_messages.append({"role": role, "content": content})

         return chat_prompt.ChatPrompt(messages=mutated_messages)

-    def _word_level_mutation_prompt(
-
-
-
+    def _word_level_mutation_prompt(
+        self, prompt: chat_prompt.ChatPrompt
+    ) -> chat_prompt.ChatPrompt:
+        mutated_messages: List[Dict[str, str]] = []
+        for message in prompt.get_messages():
+            mutated_messages.append(
+                {
+                    "role": message["role"],
+                    "content": self._word_level_mutation(message["content"]),
+                }
+            )
         return chat_prompt.ChatPrompt(messages=mutated_messages)
-
+
     def _word_level_mutation(self, msg_content: str) -> str:
         """Perform word-level mutation."""
         words = msg_content.split()
         if len(words) <= 1:
             return msg_content
-
+
         mutation_type = random.random()
-        if mutation_type < 0.3:
+        if mutation_type < 0.3:
             # Word replacement
             idx = random.randint(0, len(words) - 1)
             words[idx] = self._get_synonym(words[idx])
         elif mutation_type < 0.6:
-
+            # Word reordering
             if len(words) > 2:
                 i, j = random.sample(range(len(words)), 2)
                 words[i], words[j] = words[j], words[i]
@@ -486,39 +553,45 @@ Return only the modified prompt message list, nothing else. Make sure to return
             # Phrase modification
             idx = random.randint(0, len(words) - 1)
             words[idx] = self._modify_phrase(words[idx])
-
-        return ' '.join(words)

-
-
-
-    ) -> str:
+        return " ".join(words)
+
+    def _get_synonym(self, word: str) -> str:
         """Get a synonym for a word using LLM."""
         try:
             response = self._call_model(
                 messages=[
-                    {
-
+                    {
+                        "role": "system",
+                        "content": "You are a helpful assistant that provides synonyms. Return only the synonym word, no explanation or additional text.",
+                    },
+                    {
+                        "role": "user",
+                        "content": f"Give me a single synonym for the word '{word}'. Return only the synonym, nothing else.",
+                    },
                 ],
-                is_reasoning=True
+                is_reasoning=True,
             )
             return response.strip()
         except Exception as e:
             logger.warning(f"Error getting synonym for '{word}': {e}")
             return word

-    def _modify_phrase(
-        self,
-        phrase: str
-    ) -> str:
+    def _modify_phrase(self, phrase: str) -> str:
         """Modify a phrase while preserving meaning using LLM."""
         try:
             response = self._call_model(
                 messages=[
-                    {
-
+                    {
+                        "role": "system",
+                        "content": "You are a helpful assistant that rephrases text. Return only the modified phrase, no explanation or additional text.",
+                    },
+                    {
+                        "role": "user",
+                        "content": f"Modify this phrase while keeping the same meaning: '{phrase}'. Return only the modified phrase, nothing else.",
+                    },
                 ],
-                is_reasoning=True
+                is_reasoning=True,
             )
             return response.strip()
         except Exception as e:
@@ -526,47 +599,53 @@ Return only the modified prompt message list, nothing else. Make sure to return
             return phrase

     def _radical_innovation_mutation(
-
-
-        initial_prompt: chat_prompt.ChatPrompt
-    ) -> chat_prompt.ChatPrompt:
+        self, prompt: chat_prompt.ChatPrompt, initial_prompt: chat_prompt.ChatPrompt
+    ) -> chat_prompt.ChatPrompt:
         """Attempts to generate a significantly improved and potentially very different prompt using an LLM."""
-        logger.debug(
+        logger.debug(
+            f"Attempting radical innovation for prompt: {json.dumps(prompt.get_messages())[:70]}..."
+        )
         task_desc_for_llm = self._get_task_description_for_llm(initial_prompt)
         current_output_style_guidance = self.output_style_guidance
-
+
         user_prompt_for_radical_innovation = f"""Task Context:
 {task_desc_for_llm}
 Desired output style from target LLM: '{current_output_style_guidance}'

 Existing Prompt (which may be underperforming):
-'''{prompt.
+'''{prompt.get_messages()}'''

-Please generate a new, significantly improved, and potentially very different prompt for this task.
+Please generate a new, significantly improved, and potentially very different prompt for this task.
 Focus on alternative approaches, better clarity, or more effective guidance for the language model, aiming for the desired output style.
 Return only the new prompt list object.
 """
         try:
             new_prompt_str = self._call_model(
                 messages=[
-                    {
-
+                    {
+                        "role": "system",
+                        "content": self._get_radical_innovation_system_prompt(),
+                    },
+                    {"role": "user", "content": user_prompt_for_radical_innovation},
                 ],
-                is_reasoning=True
+                is_reasoning=True,
+            )
+            logger.info(
+                f"Radical innovation generated: {new_prompt_str[:70]}... from: {json.dumps(prompt.get_messages())[:70]}..."
             )
-            logger.info(f"Radical innovation generated: {new_prompt_str[:70]}... from: {json.dumps(prompt.formatted_messages)[:70]}...")
             return chat_prompt.ChatPrompt(messages=json.loads(new_prompt_str))
         except Exception as e:
-            logger.warning(
+            logger.warning(
+                f"Radical innovation mutation failed for prompt '{json.dumps(prompt.get_messages())[:50]}...': {e}. Returning original."
+            )
             return prompt

     def _initialize_population(
-        self,
-        prompt: chat_prompt.ChatPrompt
+        self, prompt: chat_prompt.ChatPrompt
     ) -> List[chat_prompt.ChatPrompt]:
-        """Initialize the population with diverse variations of the initial prompt,
-
-
+        """Initialize the population with diverse variations of the initial prompt,
+        including some 'fresh start' prompts based purely on task description.
+        All generated prompts should aim to elicit answers matching self.output_style_guidance.
         """
         with reporting.initializing_population(verbose=self.verbose) as init_pop_report:
             init_pop_report.start(self.population_size)
@@ -593,59 +672,75 @@ Return only the new prompt list object.

 Please generate {num_fresh_starts} diverse and effective prompt(s) for a language model to accomplish this task, ensuring they guide towards this specific output style.
 Focus on clarity, completeness, and guiding the model effectively towards the desired style. Explore different structural approaches.
-
+
 Example of valid response: [
 ["role": "<role>", "content": "<Prompt targeting specified style.>"],
 ["role": "<role>", "content": "<Another prompt designed for the output style.>"]
 ]

 Your response MUST be a valid JSON list of AI messages. Do NOT include any other text, explanations, or Markdown formatting like ```json ... ``` around the list.
-
+
 """
             try:
                 response_content = self._call_model(
                     messages=[
-                        {
-
+                        {
+                            "role": "system",
+                            "content": f"You are an expert prompt engineer. Your task is to generate novel, effective prompts from scratch based on a task description, specifically aiming for prompts that elicit answers in the style: '{current_output_style_guidance}'. Output ONLY a raw JSON list of strings.",
+                        },
+                        {"role": "user", "content": fresh_start_user_prompt},
                     ],
-                    is_reasoning=True
+                    is_reasoning=True,
                 )
-
-                logger.debug(
-
+
+                logger.debug(
+                    f"Raw LLM response for fresh start prompts: {response_content}"
+                )
+
                 fresh_prompts = utils.json_to_dict(response_content)
                 if isinstance(fresh_prompts, list):
-                    if all(isinstance(p, dict) for p in fresh_prompts) and all(
-
+                    if all(isinstance(p, dict) for p in fresh_prompts) and all(
+                        p.get("role") is not None for p in fresh_prompts
+                    ):
+                        population.append(
+                            chat_prompt.ChatPrompt(messages=fresh_prompts)
+                        )
                         init_pop_report.success_fresh_prompts(1)
                     elif all(isinstance(p, list) for p in fresh_prompts):
-                        population.extend(
-
+                        population.extend(
+                            [
+                                chat_prompt.ChatPrompt(messages=p)
+                                for p in fresh_prompts[:num_fresh_starts]
+                            ]
+                        )
+                        init_pop_report.success_fresh_prompts(
+                            len(fresh_prompts[:num_fresh_starts])
+                        )
                     else:
                         init_pop_report.failed_fresh_prompts(
                             num_fresh_starts,
-                            f"LLM response for fresh starts was not a valid list of strings or was empty: {response_content}. Skipping fresh start prompts."
+                            f"LLM response for fresh starts was not a valid list of strings or was empty: {response_content}. Skipping fresh start prompts.",
                         )
             except json.JSONDecodeError as e_json:
                 init_pop_report.failed_fresh_prompts(
                     num_fresh_starts,
-                    f"JSONDecodeError generating fresh start prompts: {e_json}. LLM response: '{response_content}'. Skipping fresh start prompts."
+                    f"JSONDecodeError generating fresh start prompts: {e_json}. LLM response: '{response_content}'. Skipping fresh start prompts.",
                 )
             except Exception as e:
                 init_pop_report.failed_fresh_prompts(
                     num_fresh_starts,
-                    f"Error generating fresh start prompts: {e}. Skipping fresh start prompts."
+                    f"Error generating fresh start prompts: {e}. Skipping fresh start prompts.",
                 )

             # Generate variations on the initial prompt for the remaining slots
             # TODO: Could add variations with hyper-parameters from the task config like temperature, etc.
             if num_variations_on_initial > 0:
                 init_pop_report.start_variations(num_variations_on_initial)
-
+
                 # TODO: We need to split this into batches as the model will not return enough tokens
                 # to generate all the candidates
                 user_prompt_for_variation = f"""Initial prompt:
-'''{prompt.
+'''{prompt.get_messages()}'''

 Task context:
 {task_desc_for_llm}
@@ -672,36 +767,61 @@ Return only the new prompt list object.
                 try:
                     response_content_variations = self._call_model(
                         messages=[
-                            {
-
+                            {
+                                "role": "system",
+                                "content": self._get_reasoning_system_prompt_for_variation(),
+                            },
+                            {"role": "user", "content": user_prompt_for_variation},
                         ],
-                        is_reasoning=True
+                        is_reasoning=True,
+                    )
+                    logger.debug(
+                        f"Raw response for population variations: {response_content_variations}"
                     )
-                    logger.debug(f"Raw response for population variations: {response_content_variations}")
                     json_response_variations = json.loads(response_content_variations)
-                    generated_prompts_variations = [
-
+                    generated_prompts_variations = [
+                        p["prompt"]
+                        for p in json_response_variations.get("prompts", [])
+                        if isinstance(p, dict) and "prompt" in p
+                    ]
+
                     if generated_prompts_variations:
-                        init_pop_report.success_variations(
-
+                        init_pop_report.success_variations(
+                            len(
+                                generated_prompts_variations[:num_variations_on_initial]
+                            )
+                        )
+                        population.extend(
+                            [
+                                chat_prompt.ChatPrompt(messages=p)
+                                for p in generated_prompts_variations[
+                                    :num_variations_on_initial
+                                ]
+                            ]
+                        )
                     else:
-                        init_pop_report.failed_variations(
+                        init_pop_report.failed_variations(
+                            num_variations_on_initial,
+                            "Could not parse 'prompts' list for variations. Skipping variations.",
+                        )
                 except Exception as e:
-                    init_pop_report.failed_variations(
+                    init_pop_report.failed_variations(
+                        num_variations_on_initial,
+                        f"Error calling LLM for initial population variations: {e}",
+                    )

             # Ensure population is of the required size using unique prompts
             # TODO Test with levenshtein distance
             final_population_set: Set[str] = set()
             final_population_list: List[chat_prompt.ChatPrompt] = []
             for p in population:
-                if json.dumps(p.
-                    final_population_set.add(json.dumps(p.
+                if json.dumps(p.get_messages()) not in final_population_set:
+                    final_population_set.add(json.dumps(p.get_messages()))
                     final_population_list.append(p)
-
+
             init_pop_report.end(final_population_list)
             # Return exactly population_size prompts if possible, or fewer if generation failed badly.
-            return final_population_list[:self.population_size]
-
+            return final_population_list[: self.population_size]

     def _should_restart_population(self, curr_best: float) -> bool:
         """
@@ -709,7 +829,9 @@ Return only the new prompt list object.
         a population restart based on lack of improvement.
         """
         if self._best_primary_score_history:
-            threshold = self._best_primary_score_history[-1] * (
+            threshold = self._best_primary_score_history[-1] * (
+                1 + self.DEFAULT_RESTART_THRESHOLD
+            )
             if curr_best < threshold:
                 self._gens_since_pop_improvement += 1
             else:
@@ -720,9 +842,9 @@ Return only the new prompt list object.
     def _restart_population(
         self,
         hof: tools.HallOfFame,
-        population:
+        population: List[Any],
         best_prompt_so_far: chat_prompt.ChatPrompt,
-    ) ->
+    ) -> List[Any]:
         """Return a fresh, evaluated population seeded by elites."""
         if self.enable_moo:
             elites = list(hof)
@@ -730,12 +852,15 @@ Return only the new prompt list object.
             elites = tools.selBest(population, self.elitism_size)

         seed_prompt = (
-            chat_prompt.ChatPrompt(
-
+            chat_prompt.ChatPrompt(
+                messages=max(elites, key=lambda x: x.fitness.values[0])
+            )
+            if elites
+            else best_prompt_so_far
         )

         prompt_variants = self._initialize_population(seed_prompt)
-        new_pop = [creator.Individual(p.
+        new_pop = [creator.Individual(p.get_messages()) for p in prompt_variants]

         for ind, fit in zip(new_pop, map(self.toolbox.evaluate, new_pop)):
             ind.fitness.values = fit
@@ -746,12 +871,12 @@ Return only the new prompt list object.
     def _run_generation(
         self,
         generation_idx: int,
-        population:
+        population: List[Any],
         prompt: chat_prompt.ChatPrompt,
         hof: tools.HallOfFame,
         report: Any,
         best_primary_score_overall: float,
-    ) -> tuple[
+    ) -> tuple[List[Any], int]:
         """Execute mating, mutation, evaluation and HoF update."""
         best_gen_score = 0.0

@@ -760,20 +885,23 @@ Return only the new prompt list object.
             offspring = self.toolbox.select(population, self.population_size)
         else:
             elites = tools.selBest(population, self.elitism_size)
-            rest
+            rest = self.toolbox.select(population, len(population) - self.elitism_size)
             offspring = elites + rest

         # --- crossover -------------------------------------------------
         report.performing_crossover()
         offspring = list(map(self.toolbox.clone, offspring))
         for i in range(0, len(offspring), 2):
-            if i+1 < len(offspring):
-                c1, c2 = offspring[i], offspring[i+1]
+            if i + 1 < len(offspring):
+                c1, c2 = offspring[i], offspring[i + 1]
                 if random.random() < self.crossover_rate:
                     c1_new, c2_new = self.toolbox.mate(c1, c2)
-                    offspring[i], offspring[i+1] = c1_new, c2_new
-                    del offspring[i].fitness.values, offspring[i+1].fitness.values
-        reporting.display_success(
+                    offspring[i], offspring[i + 1] = c1_new, c2_new
+                    del offspring[i].fitness.values, offspring[i + 1].fitness.values
+        reporting.display_success(
+            " Crossover successful, prompts have been combined and edited.\n│",
+            verbose=self.verbose,
+        )

         # --- mutation --------------------------------------------------
         report.performing_mutation()
@@ -785,27 +913,40 @@ Return only the new prompt list object.
                 offspring[i] = new_ind
                 del offspring[i].fitness.values
                 n_mutations += 1
-        reporting.display_success(
-
+        reporting.display_success(
+            f" Mutation successful, {n_mutations} prompts have been edited.\n│",
+            verbose=self.verbose,
+        )
+
         # --- evaluation ------------------------------------------------
         invalid = [ind for ind in offspring if not ind.fitness.valid]
         report.performing_evaluation(len(invalid))
         for ind_idx, ind in enumerate(invalid):
             fit = self.toolbox.evaluate(ind)
-
+            if self.enable_moo:
+                ind.fitness.values = fit
+            else:
+                ind.fitness.values = tuple([fit[0]])
             best_gen_score = max(best_gen_score, fit[0])

             report.performed_evaluation(ind_idx, ind.fitness.values[0])
-
+
         # --- update HoF & reporter ------------------------------------
         hof.update(offspring)
-        reporting.end_gen(
-
+        reporting.end_gen(
+            generation_idx,
+            best_gen_score,
+            best_primary_score_overall,
+            verbose=self.verbose,
+        )
+
         return offspring, len(invalid)

-    def _population_best_score(self, population: List[
+    def _population_best_score(self, population: List[Any]) -> float:
         """Return highest primary-objective score among *valid* individuals."""
-        valid_scores = [
+        valid_scores = [
+            ind.fitness.values[0] for ind in population if ind.fitness.valid
+        ]
         return max(valid_scores, default=0.0)

     def optimize_prompt(
@@ -816,7 +957,8 @@ Return only the new prompt list object.
         experiment_config: Optional[Dict] = None,
         n_samples: Optional[int] = None,
         auto_continue: bool = False,
-
+        agent_class: Optional[Type[OptimizableAgent]] = None,
+        **kwargs: Any,
     ) -> OptimizationResult:
         """
         Args:
@@ -828,9 +970,51 @@ Return only the new prompt list object.
             auto_continue: Whether to automatically continue optimization
             **kwargs: Additional keyword arguments
         """
-
+        if not isinstance(prompt, chat_prompt.ChatPrompt):
+            raise ValueError("Prompt must be a ChatPrompt object")
+
+        if not isinstance(dataset, opik.Dataset):
+            raise ValueError("Dataset must be a Dataset object")
+
+        if not callable(metric):
+            raise ValueError(
+                "Metric must be a function that takes `dataset_item` and `llm_output` as arguments."
+            )
+
+        if prompt.model is None:
+            prompt.model = self.model
+        if prompt.model_kwargs is None:
+            prompt.model_kwargs = self.model_kwargs
+
+        if agent_class is None:
+            self.agent_class = utils.create_litellm_agent_class(prompt)
+        else:
+            self.agent_class = agent_class
+
+        self.project_name = self.agent_class.project_name
+
+        # Step 0. Start Opik optimization run
+        opik_optimization_run: Optional[optimization.Optimization] = None
+        try:
+            opik_optimization_run = self._opik_client.create_optimization(
+                dataset_name=dataset.name,
+                objective_name=metric.__name__,
+                metadata={"optimizer": self.__class__.__name__},
+            )
+            self._current_optimization_id = opik_optimization_run.id
+        except Exception as e:
+            logger.warning(f"Opik server error: {e}. Continuing without Opik tracking.")
+            self._current_optimization_id = None
+
+        reporting.display_header(
+            algorithm=self.__class__.__name__,
+            optimization_id=self._current_optimization_id,
+            dataset_id=dataset.id,
+            verbose=self.verbose,
+        )
+
         reporting.display_configuration(
-            prompt.
+            prompt.get_messages(),
             {
                 "optimizer": f"{ 'DEAP MOO' if self.enable_moo else 'DEAP SO' } Evolutionary Optimization",
                 "population_size": self.population_size,
@@ -838,79 +1022,79 @@ Return only the new prompt list object.
                 "mutation_rate": self.mutation_rate,
                 "crossover_rate": self.crossover_rate,
             },
-            verbose=self.verbose
+            verbose=self.verbose,
         )

+        # Step 1. Step variables and define fitness function
         self.llm_call_counter = 0
-        self._history = []
-        self._current_optimization_id = None
+        self._history: List[OptimizationRound] = []
         self._current_generation = 0
         self._best_fitness_history = []
         self._generations_without_improvement = 0
-        self._llm_cache.clear()
         self._current_population = []
         self._generations_without_overall_improvement = 0
-
-        # Step 0. Define fitness function
+
         if self.enable_moo:
+
             def _deap_evaluate_individual_fitness(
-
-
-                primary_fitness_score: float = self.
-                    prompt
+                messages: List[Dict[str, str]],
+            ) -> Tuple[float, float]:
+                primary_fitness_score: float = self._evaluate_prompt(
+                    prompt,
+                    messages, # type: ignore
                     dataset=dataset,
                     metric=metric,
                     n_samples=n_samples,
                     experiment_config=(experiment_config or {}).copy(),
                     optimization_id=self._current_optimization_id,
-                    verbose=0
+                    verbose=0,
                 )
                 prompt_length = float(len(str(json.dumps(messages))))
                 return (primary_fitness_score, prompt_length)
+
         else:
             # Single-objective
             def _deap_evaluate_individual_fitness(
-
-
-                fitness_score: float = self.
-                    prompt
+                messages: List[Dict[str, str]],
+            ) -> Tuple[float, float]:
+                fitness_score: float = self._evaluate_prompt(
+                    prompt,
+                    messages, # type: ignore
                     dataset=dataset,
                     metric=metric,
                     n_samples=n_samples,
                     experiment_config=(experiment_config or {}).copy(),
                     optimization_id=self._current_optimization_id,
-                    verbose=0
+                    verbose=0,
                 )
-                return (fitness_score,)
+                return (fitness_score, 0.0)
+
         self.toolbox.register("evaluate", _deap_evaluate_individual_fitness)

-        # Step
-
-
-
-
-
-
+        # Step 2. Compute the initial performance of the prompt
+        with reporting.baseline_performance(
+            verbose=self.verbose
+        ) as report_baseline_performance:
+            initial_eval_result = _deap_evaluate_individual_fitness(
+                prompt.get_messages()
+            ) # type: ignore
+            initial_primary_score = initial_eval_result[0]
+            initial_length = (
+                initial_eval_result[1]
+                if self.enable_moo
+                else float(len(json.dumps(prompt.get_messages())))
             )
-            self._current_optimization_id = opik_optimization_run.id
-            logger.info(f"Created Opik Optimization run with ID: {self._current_optimization_id}")
-        except Exception as e:
-            logger.warning(f"Opik server error: {e}. Continuing without Opik tracking.")

-
-        with reporting.baseline_performance(verbose=self.verbose) as report_baseline_performance:
-            initial_eval_result: Tuple[float, float] | Tuple[float, ] = _deap_evaluate_individual_fitness(prompt.formatted_messages)
-            initial_primary_score: float = initial_eval_result[0]
-            initial_length: float = initial_eval_result[1] if self.enable_moo else float(len(json.dumps(prompt.formatted_messages)))
-
-            best_primary_score_overall: float = initial_primary_score
+            best_primary_score_overall = initial_primary_score
             best_prompt_overall = prompt
             report_baseline_performance.set_score(initial_primary_score)
-
+
         # Step 3. Define the output style guide
         effective_output_style_guidance = self.output_style_guidance
-        if self.infer_output_style and
-
+        if self.infer_output_style and (
+            self.output_style_guidance is None
+            or self.output_style_guidance == self.DEFAULT_OUTPUT_STYLE_GUIDANCE
+        ):
            # If user wants inference AND hasn't provided a specific custom guidance
            inferred_style = self._infer_output_style_from_dataset(dataset, prompt)
            if inferred_style:
@@ -918,22 +1102,26 @@ Return only the new prompt list object.
                # Update self.output_style_guidance for this run so dynamic prompt methods use it
                self.output_style_guidance = inferred_style
            else:
-                logger.warning(
+                logger.warning(
+                    "Failed to infer output style, using default or user-provided guidance."
+                )

        # Ensure self.output_style_guidance is set to the effective one for the rest of the methods for this run
        # (It might have been None if user passed None and infer_output_style was False)
        if self.output_style_guidance is None:
            # Fallback if still None
            self.output_style_guidance = self.DEFAULT_OUTPUT_STYLE_GUIDANCE
-
+
        # Step 4. Initialize population
        initial_prompts: List[chat_prompt.ChatPrompt] = self._initialize_population(
            prompt=prompt
        )
-
-        deap_population = [
-
-
+
+        deap_population = [
+            creator.Individual(p.get_messages()) for p in initial_prompts
+        ]
+        deap_population = deap_population[: self.population_size]
+
        # Step 5. Initialize the hall of fame (Pareto front for MOO) and stats for MOO or SO
        if self.enable_moo:
            hof = tools.ParetoFront()
@@ -942,44 +1130,72 @@ Return only the new prompt list object.
             hof = tools.HallOfFame(self.DEFAULT_HALL_OF_FAME_SIZE)
 
         # Step 6. Evaluate the initial population
-        with reporting.evaluate_initial_population(
-
-
+        with reporting.evaluate_initial_population(
+            verbose=self.verbose
+        ) as report_initial_population:
+            fitnesses: List[Any] = list(map(self.toolbox.evaluate, deap_population))
+            _best_score = max(
+                best_primary_score_overall, max([x[0] for x in fitnesses])
+            )
 
-            for i, ind, fit in zip(
-
+            for i, ind, fit in zip(
+                range(len(deap_population)), deap_population, fitnesses
+            ):
+                if self.enable_moo:
+                    ind.fitness.values = fit
+                else:
+                    ind.fitness.values = tuple([fit[0]])
                 report_initial_population.set_score(i, fit[0], _best_score)
-
+
         hof.update(deap_population)
-
+
         if hof and len(hof) > 0:
             if self.enable_moo:
-                current_best_for_primary:
-
-
+                current_best_for_primary: Any = max(
+                    hof, key=lambda ind: ind.fitness.values[0]
+                )
+                best_primary_score_overall = current_best_for_primary.fitness.values[0]
+                best_prompt_overall = chat_prompt.ChatPrompt(
+                    messages=current_best_for_primary
+                )
             else:
                 # Single-objective
                 current_best_on_front = hof[0]
-                best_primary_score_overall
-
+                best_primary_score_overall = current_best_on_front.fitness.values[0]
+                best_prompt_overall = chat_prompt.ChatPrompt(
+                    messages=current_best_on_front
+                )
+
             if self.enable_moo:
-                logger.info(
+                logger.info(
+                    f"Gen {0}: New best primary score: {best_primary_score_overall:.4f}, Prompt: {json.dumps(best_prompt_overall.get_messages())[:100]}..."
+                )
             else:
-                logger.info(
+                logger.info(
+                    f"Gen {0}: New best score: {best_primary_score_overall:.4f}"
+                )
 
         # Simplified history logging for this transition
         initial_round_data = OptimizationRound(
             round_number=0,
-            current_prompt=best_prompt_overall,
+            current_prompt=best_prompt_overall,  # Representative best
             current_score=best_primary_score_overall,
-            generated_prompts=[
+            generated_prompts=[
+                {
+                    "prompt": best_prompt_overall,
+                    "score": best_primary_score_overall,
+                    "trial_scores": [best_primary_score_overall],
+                }
+            ],
             best_prompt=best_prompt_overall,
             best_score=best_primary_score_overall,
-            improvement=0.0
-        )
+            improvement=0.0,
+        )
         self._add_to_history(initial_round_data)
 
-        with reporting.start_evolutionary_algo(
+        with reporting.start_evolutionary_algo(
+            verbose=self.verbose
+        ) as report_evolutionary_algo:
             for generation_idx in range(1, self.num_generations + 1):
                 report_evolutionary_algo.start_gen(generation_idx, self.num_generations)
 
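The hunk above wires the optimizer's prompt population into DEAP: each individual is a list of chat messages, fitnesses are assigned per individual, and a hall of fame keeps the best ones. A minimal, self-contained sketch of that pattern with the stock DEAP API (the toy evaluate function and seed messages below are invented for illustration and are not taken from the package):

from deap import base, creator, tools

# Single-objective fitness: maximize the metric score.
creator.create("FitnessMax", base.Fitness, weights=(1.0,))
# An individual is a list of chat messages (role/content dicts).
creator.create("Individual", list, fitness=creator.FitnessMax)

def evaluate(messages):
    # Placeholder metric: reward longer system prompts (stand-in for a real eval).
    return (float(len(messages[0]["content"])),)

toolbox = base.Toolbox()
toolbox.register("evaluate", evaluate)

seed_messages = [{"role": "system", "content": "You are a helpful assistant."}]
population = [creator.Individual(list(seed_messages)) for _ in range(4)]

# Assign fitnesses, then keep the best individuals in a hall of fame.
fitnesses = list(map(toolbox.evaluate, population))
for ind, fit in zip(population, fitnesses):
    ind.fitness.values = fit

hof = tools.HallOfFame(3)
hof.update(population)
print(hof[0].fitness.values[0])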
@@ -987,21 +1203,30 @@ Return only the new prompt list object.
 
                 # ---------- restart logic -------------------------------------
                 if self._should_restart_population(curr_best_score):
-                    report_evolutionary_algo.restart_population(
+                    report_evolutionary_algo.restart_population(
+                        self.DEFAULT_RESTART_GENERATIONS
+                    )
                     deap_population = self._restart_population(
                         hof, deap_population, best_prompt_overall
                     )
 
                 # ---------- run one generation --------------------------------
                 deap_population, invalid_count = self._run_generation(
-                    generation_idx,
+                    generation_idx,
+                    deap_population,
+                    prompt,
+                    hof,
+                    report_evolutionary_algo,
+                    best_primary_score_overall,
                 )
 
                 # -------- update best-prompt bookkeeping -------------------------
                 previous_best_primary_score_for_gen = best_primary_score_overall
                 if hof:
                     if self.enable_moo:
-                        current_best_ind = max(
+                        current_best_ind = max(
+                            hof, key=lambda ind: ind.fitness.values[0]
+                        )
                     else:
                         current_best_ind = hof[0]
 
@@ -1009,7 +1234,10 @@ Return only the new prompt list object.
                     if updated_best_primary_score > best_primary_score_overall:
                         best_primary_score_overall = updated_best_primary_score
                         self._generations_without_overall_improvement = 0
-                    elif
+                    elif (
+                        updated_best_primary_score
+                        == previous_best_primary_score_for_gen
+                    ):
                         self._generations_without_overall_improvement += 1
                     else:
                         self._generations_without_overall_improvement += 1
@@ -1017,7 +1245,10 @@ Return only the new prompt list object.
                     self._generations_without_overall_improvement += 1
 
                 # ---------- early-stopping check ------------------------------
-                if
+                if (
+                    self._generations_without_overall_improvement
+                    >= self.DEFAULT_EARLY_STOPPING_GENERATIONS
+                ):
                     logger.info(
                         "No overall improvement for %d generations – early stopping at gen %d.",
                         self.DEFAULT_EARLY_STOPPING_GENERATIONS,
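The early-stopping check above is a plain patience counter over generations. A rough standalone sketch of the same control flow (the patience constant and the score sequence are made up for illustration):

EARLY_STOPPING_GENERATIONS = 3  # assumed patience value, not the package default

best_score = float("-inf")
generations_without_improvement = 0

for generation, score in enumerate([0.40, 0.55, 0.55, 0.54, 0.55], start=1):
    if score > best_score:
        best_score = score
        generations_without_improvement = 0
    else:
        generations_without_improvement += 1

    if generations_without_improvement >= EARLY_STOPPING_GENERATIONS:
        print(f"No improvement for {EARLY_STOPPING_GENERATIONS} generations - stopping at gen {generation}.")
        break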
@@ -1029,114 +1260,168 @@ Return only the new prompt list object.
                 # FIXME: Use model.dump() instead of dict()
                 gen_round_data = OptimizationRound(
                     round_number=generation_idx,
-                    current_prompt=best_prompt_overall,
+                    current_prompt=best_prompt_overall,  # Representative best
                     current_score=best_primary_score_overall,
-                    generated_prompts=[
+                    generated_prompts=[
+                        {"prompt": str(ind), "score": ind.fitness.values[0]}
+                        for ind in deap_population
+                        if ind.fitness.valid
+                    ],
                     best_prompt=best_prompt_overall,
                     best_score=best_primary_score_overall,
-                    improvement=(
-
+                    improvement=(
+                        (best_primary_score_overall - initial_primary_score)
+                        / abs(initial_primary_score)
+                        if initial_primary_score and initial_primary_score != 0
+                        else (1.0 if best_primary_score_overall > 0 else 0.0)
+                    ),
+                )
                 self._add_to_history(gen_round_data)
 
-        stopped_early_flag =
+        stopped_early_flag = (
+            self._generations_without_overall_improvement
+            >= self.DEFAULT_EARLY_STOPPING_GENERATIONS
+        )
         final_details = {}
         initial_score_for_display = initial_primary_score
 
         if self.enable_moo:
             final_results_log = "Pareto Front Solutions:\n"
             if hof and len(hof) > 0:
-                sorted_hof = sorted(
+                sorted_hof = sorted(
+                    hof, key=lambda ind: ind.fitness.values[0], reverse=True
+                )
                 for i, sol in enumerate(sorted_hof):
                     final_results_log += f"  Solution {i+1}: Primary Score={sol.fitness.values[0]:.4f}, Length={sol.fitness.values[1]:.0f}, Prompt='{str(sol)[:100]}...'\n"
                 best_overall_solution = sorted_hof[0]
-                final_best_prompt = chat_prompt.ChatPrompt(
+                final_best_prompt = chat_prompt.ChatPrompt(
+                    messages=best_overall_solution
+                )
                 final_primary_score = best_overall_solution.fitness.values[0]
                 final_length = best_overall_solution.fitness.values[1]
                 logger.info(final_results_log)
-                logger.info(
-
+                logger.info(
+                    f"Representative best prompt (highest primary score from Pareto front): '{final_best_prompt}'"
+                )
+                logger.info(
+                    f"  Primary Score ({metric.__name__}): {final_primary_score:.4f}"
+                )
                 logger.info(f"  Length: {final_length:.0f}")
-                final_details.update(
-
-
-
-
-
-
-
-
-
-
+                final_details.update(
+                    {
+                        "initial_primary_score": initial_primary_score,
+                        "initial_length": initial_length,
+                        "final_prompt_representative": final_best_prompt,
+                        "final_primary_score_representative": final_primary_score,
+                        "final_length_representative": final_length,
+                        "pareto_front_solutions": (
+                            [
+                                {
+                                    "prompt": str(ind),
+                                    "score": ind.fitness.values[0],
+                                    "length": ind.fitness.values[1],
+                                }
+                                for ind in hof
+                            ]
+                            if hof
+                            else []
+                        ),
+                    }
+                )
             else:
                 # MOO: ParetoFront is empty. Reporting last known best and fallback values
                 logger.warning("MOO: ParetoFront is empty. Reporting last known best.")
                 final_best_prompt = best_prompt_overall
                 final_primary_score = best_primary_score_overall
-                final_length = float(len(json.dumps(final_best_prompt.
-                final_details.update(
-
-
+                final_length = float(len(json.dumps(final_best_prompt.get_messages())))
+                final_details.update(
+                    {
+                        "initial_primary_score": initial_primary_score,
+                        "initial_length": initial_length,
+                        "final_prompt_representative": final_best_prompt,
+                        "final_primary_score_representative": final_primary_score,
+                        "final_length_representative": final_length,
+                        "pareto_front_solutions": [],
+                    }
+                )
         else:
             # Single-objective
             final_best_prompt = best_prompt_overall
             final_primary_score = best_primary_score_overall
             logger.info(f"Final best prompt from Hall of Fame: '{final_best_prompt}'")
-            logger.info(
-
-
-
-
-
-
-
-
+            logger.info(
+                f"Final best score ({metric.__name__}): {final_primary_score:.4f}"
+            )
+            final_details.update(
+                {
+                    "initial_prompt": prompt.get_messages(),
+                    "initial_score": initial_primary_score,
+                    "initial_score_for_display": initial_primary_score,
+                    "final_prompt": final_best_prompt,
+                    "final_score": final_primary_score,
+                }
+            )
+
         logger.info(f"Total LLM calls during optimization: {self.llm_call_counter}")
         if opik_optimization_run:
             try:
                 opik_optimization_run.update(status="completed")
-                logger.info(
+                logger.info(
+                    f"Opik Optimization run {self._current_optimization_id} status updated to completed."
+                )
             except Exception as e:
                 logger.warning(f"Failed to update Opik Optimization run status: {e}")
 
         # Add final details
-        final_details.update(
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        final_details.update(
+            {
+                "total_generations_run": generation_idx + 1,
+                "num_generations": self.num_generations,
+                "population_size": self.population_size,
+                "mutation_probability": self.mutation_rate,
+                "crossover_probability": self.crossover_rate,
+                "elitism_size": (
+                    self.elitism_size
+                    if not self.enable_moo
+                    else "N/A (MOO uses NSGA-II)"
+                ),
+                "adaptive_mutation": self.adaptive_mutation,
+                "metric_name": metric.__name__,
+                "model": self.model,
+                "moo_enabled": self.enable_moo,
+                "llm_crossover_enabled": self.enable_llm_crossover,
+                "seed": self.seed,
+                "prompt_type": "single_string_ga",
+                "initial_score_for_display": initial_score_for_display,
+                "temperature": self.model_kwargs.get("temperature"),
+                "stopped_early": stopped_early_flag,
+                "rounds": self.get_history(),
+                "user_output_style_guidance": self.output_style_guidance,
+                "infer_output_style_requested": self.infer_output_style,
+                "final_effective_output_style_guidance": effective_output_style_guidance,
+                "infer_output_style": self.infer_output_style,
+            }
+        )
 
         # Return the OptimizationResult
         reporting.display_result(
             initial_score=initial_score_for_display,
             best_score=final_primary_score,
-            best_prompt=final_best_prompt.
-            verbose=self.verbose
+            best_prompt=final_best_prompt.get_messages(),
+            verbose=self.verbose,
         )
         return OptimizationResult(
             optimizer=self.__class__.__name__,
-            prompt=final_best_prompt.
-            score=final_primary_score,
+            prompt=final_best_prompt.get_messages(),
+            score=final_primary_score,
+            initial_prompt=prompt.get_messages(),
+            initial_score=initial_primary_score,
             metric_name=metric.__name__,
             details=final_details,
-            history=self.get_history(),
-            llm_calls=self.llm_call_counter
+            history=[x.model_dump() for x in self.get_history()],
+            llm_calls=self.llm_call_counter,
+            dataset_id=dataset.id,
+            optimization_id=self._current_optimization_id,
         )
 
     @_throttle.rate_limited(_rate_limiter)
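The per-round `improvement` field recorded above is a relative gain over the baseline score, guarded against a zero baseline. The same arithmetic in isolation (the example scores are invented):

def relative_improvement(initial_score: float, best_score: float) -> float:
    # Relative gain over the baseline, mirroring the guard used above:
    # if the baseline is 0, report 1.0 when any positive score was reached.
    if initial_score and initial_score != 0:
        return (best_score - initial_score) / abs(initial_score)
    return 1.0 if best_score > 0 else 0.0

print(relative_improvement(0.50, 0.65))  # 0.3
print(relative_improvement(0.0, 0.42))   # 1.0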
@@ -1158,7 +1443,7 @@ Return only the new prompt list object.
         }
 
         # Prepare metadata for opik
-        metadata_for_opik = {}
+        metadata_for_opik: Dict[str, Any] = {}
         if self.project_name:
             metadata_for_opik["project_name"] = self.project_name
             metadata_for_opik["opik"] = {"project_name": self.project_name}
@@ -1168,7 +1453,9 @@ Return only the new prompt list object.
             metadata_for_opik["opik"]["optimization_id"] = optimization_id
 
         metadata_for_opik["optimizer_name"] = self.__class__.__name__
-        metadata_for_opik["opik_call_type"] =
+        metadata_for_opik["opik_call_type"] = (
+            "reasoning" if is_reasoning else "evaluation_llm_task_direct"
+        )
 
         if metadata_for_opik:
             llm_config_params["metadata"] = metadata_for_opik
@@ -1186,6 +1473,7 @@ Return only the new prompt list object.
         response = litellm.completion(
             model=self.model, messages=messages, **final_call_params
         )
+        self.llm_call_counter += 1
 
         logger.debug(f"Response: {response}")
         return response.choices[0].message.content
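The added `self.llm_call_counter += 1` keeps the reported `llm_calls` total in step with every successful `litellm.completion` call. A hedged sketch of that bookkeeping in a small wrapper class (the class name and model string are placeholders, not part of the package):

import litellm

class CountingLLMClient:
    def __init__(self, model: str) -> None:
        self.model = model
        self.llm_call_counter = 0

    def call(self, messages: list) -> str:
        response = litellm.completion(model=self.model, messages=messages)
        self.llm_call_counter += 1  # count only calls that returned successfully
        return response.choices[0].message.content

# client = CountingLLMClient("openai/gpt-4o-mini")
# print(client.call([{"role": "user", "content": "Say hi"}]))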
@@ -1199,12 +1487,15 @@ Return only the new prompt list object.
             logger.error(f"LiteLLM Context Window Exceeded Error: {e}")
             raise
         except Exception as e:
-            logger.error(
+            logger.error(
+                f"Error calling model '{self.model}': {type(e).__name__} - {e}"
+            )
             raise
 
-    def
+    def _evaluate_prompt(
         self,
         prompt: chat_prompt.ChatPrompt,
+        messages: List[Dict[str, str]],
         dataset: opik.Dataset,
         metric: Callable,
         n_samples: Optional[int] = None,
@@ -1212,12 +1503,13 @@ Return only the new prompt list object.
         experiment_config: Optional[Dict] = None,
         optimization_id: Optional[str] = None,
         verbose: int = 0,
+        **kwargs: Any,
     ) -> float:
         """
         Evaluate a single prompt (individual) against the dataset.
-
+
         Args:
-            prompt:
+            prompt:
             dataset: The dataset to use for evaluation
             metric: Metric function to evaluate on, should have the arguments `dataset_item` and `llm_output`
             n_samples: Optional number of samples to use
@@ -1225,46 +1517,44 @@ Return only the new prompt list object.
             experiment_config: Optional experiment configuration
             optimization_id: Optional optimization ID
             verbose: Controls internal logging/progress bars (0=off, 1=on).
-
+
         Returns:
             float: The metric value
         """
         total_items = len(dataset.get_items())
-
-
-
-
-            **
-
-
-
-
-
-
-
-
+
+        experiment_config = experiment_config or {}
+        experiment_config["project_name"] = self.agent_class.project_name
+        experiment_config = {
+            **experiment_config,
+            "optimizer": self.__class__.__name__,
+            "agent_class": self.agent_class.__name__,
+            "agent_config": prompt.to_dict(),
+            "metric": metric.__name__,
+            "dataset": dataset.name,
+            "configuration": {
+                "prompt": prompt.get_messages(),
+                "n_samples_for_eval": (
+                    len(dataset_item_ids) if dataset_item_ids is not None else n_samples
+                ),
+                "total_dataset_items": total_items,
             },
         }
 
-
-
-
-
-
-
-
-
-
-
-
-
-            model_output = self._call_model(
-                messages=messages,
-                is_reasoning=False
-            )
-
+        new_prompt = prompt.copy()
+        new_prompt.set_messages(messages)
+        try:
+            agent = self.agent_class(new_prompt)
+        except Exception:
+            return 0.0
+
+        def llm_task(dataset_item: Dict[str, Any]) -> Dict[str, str]:
+            # print("MESSAGES:", new_prompt.messages)
+            messages = new_prompt.get_messages(dataset_item)
+            model_output = agent.invoke(messages)
+            # print("OUTPUT:", model_output)
            return {mappers.EVALUATED_LLM_TASK_OUTPUT: model_output}
-
+
        # Evaluate the prompt
        score = task_evaluator.evaluate(
            dataset=dataset,
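In the new `_evaluate_prompt`, the model call is delegated to an agent built once from the candidate prompt, and `llm_task` only renders the messages for each dataset item before invoking it. A simplified sketch of that closure pattern under assumed names (`EchoAgent`, `make_llm_task`, and the `llm_output` key are stand-ins, not the package's classes):

from typing import Any, Dict, List

class EchoAgent:
    """Stand-in agent: returns the rendered user message instead of calling an LLM."""
    def invoke(self, messages: List[Dict[str, str]]) -> str:
        return messages[-1]["content"]

def make_llm_task(agent: EchoAgent, template: str):
    def llm_task(dataset_item: Dict[str, Any]) -> Dict[str, str]:
        # Render the prompt for this dataset item, then ask the agent for an answer.
        messages = [{"role": "user", "content": template.format(**dataset_item)}]
        return {"llm_output": agent.invoke(messages)}
    return llm_task

task = make_llm_task(EchoAgent(), "Question: {question}")
print(task({"question": "What is 2 + 2?"}))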
@@ -1272,24 +1562,22 @@ Return only the new prompt list object.
             metric=metric,
             evaluated_task=llm_task,
             num_threads=self.num_threads,
-            project_name=
+            project_name=experiment_config["project_name"],
             n_samples=n_samples if dataset_item_ids is None else None,
-            experiment_config=
+            experiment_config=experiment_config,
             optimization_id=optimization_id,
-            verbose=verbose
+            verbose=verbose,
         )
         return score
 
-    def _llm_deap_crossover(
-        self,
-        ind1: "creator.Individual",
-        ind2: "creator.Individual"
-    ) -> Tuple["creator.Individual", "creator.Individual"]:
+    def _llm_deap_crossover(self, ind1: Any, ind2: Any) -> Tuple[Any, Any]:
         """Perform crossover by asking an LLM to blend two parent prompts."""
-        reporting.display_message(
+        reporting.display_message(
+            "  Recombining prompts using an LLM.", verbose=self.verbose
+        )
 
-        parent1_messages: List[Dict[
-        parent2_messages: List[Dict[
+        parent1_messages: List[Dict[str, str]] = ind1
+        parent2_messages: List[Dict[str, str]] = ind2
         current_output_style_guidance = self.output_style_guidance
 
         user_prompt_for_llm_crossover = f"""Parent Prompt 1:
@@ -1308,38 +1596,50 @@ Follow the instructions provided in the system prompt regarding the JSON output
 ]
 """
         try:
-            logger.debug(
+            logger.debug(
+                f"Attempting LLM-driven crossover between: '{parent1_messages[:50]}...' and '{parent2_messages[:50]}...' aiming for style: '{current_output_style_guidance[:30]}...'"
+            )
             response_content = self._call_model(
                 messages=[
-                    {
+                    {
+                        "role": "system",
+                        "content": self.get_llm_crossover_system_prompt(),
+                    },
                     {"role": "user", "content": user_prompt_for_llm_crossover},
                 ],
-                is_reasoning=True
+                is_reasoning=True,
             )
             logger.debug(f"Raw LLM response for crossover: {response_content}")
 
             json_response = utils.json_to_dict(response_content)
-            if
-
+            if (
+                not isinstance(json_response, list)
+                or len(json_response) != 2
+                or not all(isinstance(cs, list) for cs in json_response)
+            ):
+                logger.warning(
+                    "LLM Crossover: Malformed or empty children_prompts list. Falling back."
+                )
                 raise ValueError("Malformed LLM crossover response")
 
-            child1: List[Dict[
-            child2: List[Dict[
-
-            logger.debug(
-
+            child1: List[Dict[str, str]] = json_response[0]
+            child2: List[Dict[str, str]] = json_response[1]
+
+            logger.debug(
+                f"LLM Crossover generated child1: {json.dumps(child1)[:50]}... Child2: {json.dumps(child2)[:50]}..."
+            )
+            return creator.Individual(child1), creator.Individual(child2)
 
         except Exception as e:
-            logger.warning(
+            logger.warning(
+                f"LLM-driven crossover failed: {e}. Falling back to standard crossover."
+            )
             return self._deap_crossover(ind1, ind2)
 
-    def _get_task_description_for_llm(
-        self,
-        prompt: chat_prompt.ChatPrompt
-    ) -> str:
+    def _get_task_description_for_llm(self, prompt: chat_prompt.ChatPrompt) -> str:
         """Generates a concise task description for use in LLM prompts for fresh generation or radical innovation."""
         description = "Task: Given a list of AI messages with placeholder values, generate an effective prompt. "
-        description += f"The original high-level instruction being optimized is: '{prompt.
+        description += f"The original high-level instruction being optimized is: '{prompt.get_messages()}'. "
         description += "The goal is to create an effective prompt that guides a language model to perform this task well."
         return description
 
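`_llm_deap_crossover` accepts the LLM reply only if it parses into a JSON list of exactly two message lists, and otherwise falls back to the standard crossover. A reduced sketch of that validate-or-fallback logic (the fallback behaviour and sample inputs are invented for illustration):

import json
from typing import Any, Dict, List, Tuple

Messages = List[Dict[str, str]]

def fallback_crossover(p1: Messages, p2: Messages) -> Tuple[Messages, Messages]:
    # Trivial stand-in for the non-LLM crossover: swap the parents.
    return p2, p1

def crossover_from_llm(raw: str, p1: Messages, p2: Messages) -> Tuple[Messages, Messages]:
    try:
        parsed: Any = json.loads(raw)
        if (
            not isinstance(parsed, list)
            or len(parsed) != 2
            or not all(isinstance(child, list) for child in parsed)
        ):
            raise ValueError("Malformed LLM crossover response")
        return parsed[0], parsed[1]
    except Exception:
        return fallback_crossover(p1, p2)

good = '[[{"role": "system", "content": "A"}], [{"role": "system", "content": "B"}]]'
print(crossover_from_llm(good, [], []))
print(crossover_from_llm("not json", [{"role": "system", "content": "P1"}], []))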
@@ -1375,8 +1675,8 @@ Each prompt variation should aim to get the target LLM to produce answers matchi
     """
 
     def get_llm_crossover_system_prompt(self) -> str:
-        return f"""You are an expert prompt engineer specializing in creating novel prompts by intelligently blending existing ones.
-Given two parent prompts, your task is to generate one or two new child prompts that effectively combine the strengths, styles, or core ideas of both parents.
+        return f"""You are an expert prompt engineer specializing in creating novel prompts by intelligently blending existing ones.
+Given two parent prompts, your task is to generate one or two new child prompts that effectively combine the strengths, styles, or core ideas of both parents.
 The children should be coherent and aim to explore a potentially more effective region of the prompt design space, with a key goal of eliciting responses from the target language model in the following style: '{self.output_style_guidance}'.
 
 Consider the following when generating children:
@@ -1397,69 +1697,80 @@ Return only the new prompt list object.
 """
 
     def _get_radical_innovation_system_prompt(self) -> str:
-        return f"""You are an expert prompt engineer and a creative problem solver.
-Given a task description and an existing prompt for that task (which might be underperforming), your goal is to generate a new, significantly improved, and potentially very different prompt.
-Do not just make minor edits. Think about alternative approaches, structures, and phrasings that could lead to better performance.
+        return f"""You are an expert prompt engineer and a creative problem solver.
+Given a task description and an existing prompt for that task (which might be underperforming), your goal is to generate a new, significantly improved, and potentially very different prompt.
+Do not just make minor edits. Think about alternative approaches, structures, and phrasings that could lead to better performance.
 Consider clarity, specificity, constraints, and how to best guide the language model for the described task TO PRODUCE OUTPUTS IN THE FOLLOWING STYLE: '{self.output_style_guidance}'.
 Return only the new prompt string, with no preamble or explanation.
 """
 
     def _infer_output_style_from_dataset(
-
-
-        prompt: chat_prompt.ChatPrompt,
-        n_examples: int = 5
-    ) -> Optional[str]:
+        self, dataset: opik.Dataset, prompt: chat_prompt.ChatPrompt, n_examples: int = 5
+    ) -> Optional[str]:
         """Analyzes dataset examples to infer the desired output style."""
-        with reporting.infer_output_style(
-
-
+        with reporting.infer_output_style(
+            verbose=self.verbose
+        ) as report_infer_output_style:
+            report_infer_output_style.start_style_inference()
+
             try:
                 items_to_process = dataset.get_items(n_examples)
             except Exception as e:
-                report_infer_output_style.error(
+                report_infer_output_style.error(
+                    f"Failed to get items from dataset '{dataset.name}': {e}"
+                )
                 return None
 
            if not items_to_process:
-                report_infer_output_style.error(
+                report_infer_output_style.error(
+                    f"Dataset '{dataset.name}' is empty. Cannot infer output style."
+                )
                return None
 
            # Need at least a couple of examples for meaningful inference
            if len(items_to_process) < min(n_examples, 2):
-                report_infer_output_style.error(
+                report_infer_output_style.error(
+                    f"Not enough dataset items (found {len(items_to_process)}) to reliably infer output style. Need at least {min(n_examples,2)}."
+                )
                return None
 
            examples_str = ""
            for i, item_content in enumerate(items_to_process):
                filtered_content = {x: y for x, y in item_content.items() if x != "id"}
-                examples_str +=
+                examples_str += (
+                    f"Example {i+1}:\nDataset Item:\n{filtered_content}\n---\n"
+                )
 
            user_prompt_for_style_inference = f"""Please analyze the following examples from a dataset and provide a concise, actionable description of the REQUIRED output style for the target LLM. Before describing the output style, make sure to understand the dataset content and structure as it can include input, output and metadata fields. This description will be used to guide other LLMs in generating and refining prompts.
 
 {examples_str}
 
-Based on these examples, what is the desired output style description?
-Remember to focus on aspects like length, tone, structure, content details, and any recurring keywords or phrasing patterns in the outputs.
+Based on these examples, what is the desired output style description?
+Remember to focus on aspects like length, tone, structure, content details, and any recurring keywords or phrasing patterns in the outputs.
 The description should be a single string that can be directly used as an instruction for another LLM.
 Return ONLY this descriptive string.
 """
-            #report_infer_output_style.display_style_inference_prompt(user_prompt_for_style_inference)
+            # report_infer_output_style.display_style_inference_prompt(user_prompt_for_style_inference)
 
            try:
                inferred_style = self._call_model(
                    messages=[
                        {"role": "system", "content": self._INFER_STYLE_SYSTEM_PROMPT},
-                        {"role": "user", "content": user_prompt_for_style_inference}
+                        {"role": "user", "content": user_prompt_for_style_inference},
                    ],
-                    is_reasoning=True
+                    is_reasoning=True,
                )
                inferred_style = inferred_style.strip()
                if inferred_style:
                    report_infer_output_style.success(inferred_style)
                    return inferred_style
                else:
-                    report_infer_output_style.error(
+                    report_infer_output_style.error(
+                        "LLM returned empty string for inferred output style."
+                    )
                    return None
            except Exception as e:
-                report_infer_output_style.error(
+                report_infer_output_style.error(
+                    f"Error during output style inference: {e}"
+                )
                return None
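Output-style inference samples a few dataset items, drops their `id` fields and folds the rest into a single prompt for the reasoning model, as the final hunk shows. The formatting step looks roughly like this (the sample items and the surrounding instruction text are invented):

items = [
    {"id": "1", "question": "Capital of France?", "answer": "Paris"},
    {"id": "2", "question": "2 + 2?", "answer": "4"},
]

examples_str = ""
for i, item_content in enumerate(items):
    filtered_content = {k: v for k, v in item_content.items() if k != "id"}
    examples_str += f"Example {i+1}:\nDataset Item:\n{filtered_content}\n---\n"

prompt = (
    "Describe the required output style based on these examples, "
    "focusing on length, tone and structure.\n\n" + examples_str
)
print(prompt)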