opik-optimizer 0.8.0__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opik_optimizer/__init__.py +15 -26
- opik_optimizer/base_optimizer.py +28 -44
- opik_optimizer/data/hotpot-500.json +501 -1001
- opik_optimizer/datasets/__init__.py +6 -7
- opik_optimizer/datasets/hotpot_qa.py +2 -1
- opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +742 -726
- opik_optimizer/evolutionary_optimizer/reporting.py +246 -0
- opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +297 -193
- opik_optimizer/few_shot_bayesian_optimizer/reporting.py +119 -0
- opik_optimizer/meta_prompt_optimizer/__init__.py +5 -0
- opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +816 -0
- opik_optimizer/meta_prompt_optimizer/reporting.py +140 -0
- opik_optimizer/mipro_optimizer/__init__.py +1 -1
- opik_optimizer/mipro_optimizer/_mipro_optimizer_v2.py +12 -20
- opik_optimizer/mipro_optimizer/mipro_optimizer.py +32 -52
- opik_optimizer/mipro_optimizer/utils.py +1 -23
- opik_optimizer/optimization_config/chat_prompt.py +106 -0
- opik_optimizer/optimization_config/configs.py +2 -21
- opik_optimizer/optimization_config/mappers.py +1 -1
- opik_optimizer/optimization_result.py +57 -85
- opik_optimizer/reporting_utils.py +180 -0
- opik_optimizer/task_evaluator.py +41 -26
- opik_optimizer/utils.py +187 -3
- {opik_optimizer-0.8.0.dist-info → opik_optimizer-0.9.0.dist-info}/METADATA +15 -31
- opik_optimizer-0.9.0.dist-info/RECORD +48 -0
- {opik_optimizer-0.8.0.dist-info → opik_optimizer-0.9.0.dist-info}/WHEEL +1 -1
- opik_optimizer/few_shot_bayesian_optimizer/prompt_parameter.py +0 -91
- opik_optimizer/few_shot_bayesian_optimizer/prompt_templates.py +0 -80
- opik_optimizer/integrations/__init__.py +0 -0
- opik_optimizer/meta_prompt_optimizer.py +0 -1151
- opik_optimizer-0.8.0.dist-info/RECORD +0 -45
- {opik_optimizer-0.8.0.dist-info → opik_optimizer-0.9.0.dist-info}/licenses/LICENSE +0 -0
- {opik_optimizer-0.8.0.dist-info → opik_optimizer-0.9.0.dist-info}/top_level.txt +0 -0
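The headline change in 0.9.0 is that the optimizers now take a `chat_prompt.ChatPrompt` plus a plain metric callable instead of the old task/metric config objects (see the rewritten `optimize_prompt` signature in the diff below). A minimal usage sketch of that new interface, assuming the import paths implied by the file list above; the model name, dataset name, and the `exact_match` metric are illustrative placeholders, and the metric signature (`dataset_item`, `llm_output`) follows the new docstring:

```python
# Sketch only: illustrates the 0.9.0 ChatPrompt-based API reflected in this diff.
import opik
from opik_optimizer.evolutionary_optimizer.evolutionary_optimizer import EvolutionaryOptimizer
from opik_optimizer.optimization_config.chat_prompt import ChatPrompt


def exact_match(dataset_item: dict, llm_output: str) -> float:
    # Toy metric: assumes each dataset item has an "answer" field.
    return float(dataset_item["answer"].strip() == llm_output.strip())


prompt = ChatPrompt(
    messages=[
        {"role": "system", "content": "Answer the question concisely."},
        {"role": "user", "content": "{question}"},  # placeholder user turn
    ]
)

optimizer = EvolutionaryOptimizer(
    model="openai/gpt-4o-mini",  # any LiteLLM-compatible model id
    population_size=10,
    num_generations=3,
)

dataset = opik.Opik().get_dataset("hotpot-500")  # assumes this dataset already exists in Opik
result = optimizer.optimize_prompt(
    prompt=prompt,
    dataset=dataset,
    metric=exact_match,
    n_samples=50,
)
print(result)
```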
opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py

@@ -1,28 +1,31 @@
-from typing import Optional, Union, List, Dict, Any, Tuple
-import opik
-import logging
-import random
 import json
-
+import logging
 import os
-import
+import random
+from typing import Any, Callable, Dict, List, Literal, Optional, Set, Tuple, cast
+
 import Levenshtein
+import litellm
 import numpy as np
+import opik
 
-
-from
-from
-from
-from opik_optimizer.optimization_config import mappers
-from opik.api_objects import opik_client
-from opik.environment import get_tqdm_for_current_environment
-from opik_optimizer import _throttle
-import litellm
+# DEAP imports
+from deap import base, tools
+from deap import creator as _creator
+from litellm import exceptions as litellm_exceptions
 from litellm.caching import Cache
+from litellm.types.caching import LiteLLMCacheType
+from opik.api_objects import opik_client, optimization
+from opik.environment import get_tqdm_for_current_environment
 from opik.evaluation.models.litellm import opik_monitor as opik_litellm_monitor
 
-
-from
+from opik_optimizer import _throttle, task_evaluator
+from opik_optimizer.base_optimizer import BaseOptimizer, OptimizationRound
+from opik_optimizer.optimization_config import chat_prompt, mappers
+from opik_optimizer.optimization_result import OptimizationResult
+
+from .. import utils
+from . import reporting
 
 logger = logging.getLogger(__name__)
 tqdm = get_tqdm_for_current_environment()
@@ -30,13 +33,25 @@ _rate_limiter = _throttle.get_rate_limiter_for_current_opik_installation()
 
 # Using disk cache for LLM calls
 disk_cache_dir = os.path.expanduser("~/.litellm_cache")
-litellm.cache = Cache(type=
+litellm.cache = Cache(type=LiteLLMCacheType.DISK, disk_cache_dir=disk_cache_dir)
+
+creator = cast(Any, _creator)  # type: ignore[assignment]
 
 class EvolutionaryOptimizer(BaseOptimizer):
     """
-
-
-
+    The Evolutionary Optimizer can be used to optimize prompts using a 4 stage genetic algorithm
+    approach:
+
+    1. Generate a set of candidate prompts based on variations of the best prompts (exploitation) as
+       well as completely new prompts (exploration)
+    2. Evaluate the candidate prompts
+    3. Select the best prompts
+    4. Repeat until convergence
+
+    This algorithm is best used if you have a first draft prompt and would like to find a better
+    prompt.
+
+    Note: This algorithm is time consuming and can be expensive to run.
     """
 
     DEFAULT_POPULATION_SIZE = 30
@@ -78,7 +93,7 @@ Return ONLY this descriptive string, with no preamble or extra formatting.
     def __init__(
         self,
         model: str,
-        project_name:
+        project_name: str = "Optimization",
         population_size: int = DEFAULT_POPULATION_SIZE,
         num_generations: int = DEFAULT_NUM_GENERATIONS,
         mutation_rate: float = DEFAULT_MUTATION_RATE,
@@ -95,9 +110,26 @@ Return ONLY this descriptive string, with no preamble or extra formatting.
         verbose: int = 1,
         **model_kwargs,
     ):
-
-
-
+        """
+        Args:
+            model: The model to use for evaluation
+            project_name: Optional project name for tracking
+            population_size: Number of prompts in the population
+            num_generations: Number of generations to run
+            mutation_rate: Mutation rate for genetic operations
+            crossover_rate: Crossover rate for genetic operations
+            tournament_size: Tournament size for selection
+            num_threads: Number of threads for parallel evaluation
+            elitism_size: Number of elitism prompts
+            adaptive_mutation: Whether to use adaptive mutation
+            enable_moo: Whether to enable multi-objective optimization - When enable optimizes for both the supplied metric and the length of the prompt
+            enable_llm_crossover: Whether to enable LLM crossover
+            seed: Random seed for reproducibility
+            output_style_guidance: Output style guidance for prompts
+            infer_output_style: Whether to infer output style
+            verbose: Controls internal logging/progress bars (0=off, 1=on).
+            **model_kwargs: Additional model parameters
+        """
         # Initialize base class first
         super().__init__(model=model, project_name=project_name, **model_kwargs)
         self.population_size = population_size
@@ -122,12 +154,15 @@ Return ONLY this descriptive string, with no preamble or extra formatting.
         self._llm_cache = {}
         self._current_population = []
         self._generations_without_overall_improvement = 0
+        self._best_primary_score_history: list[float] = []
+        self._gens_since_pop_improvement: int = 0
+        self.verbose = verbose
 
         if self.seed is not None:
             random.seed(self.seed)
             np.random.seed(self.seed)
             logger.info(f"Global random seed set to: {self.seed}")
-            # Note: DEAP tools generally respect random.seed().
+            # Note: DEAP tools generally respect random.seed().
            # TODO investigate if specific DEAP components require separate seeding
 
         if self.enable_moo:
@@ -142,7 +177,7 @@ Return ONLY this descriptive string, with no preamble or extra formatting.
         if not hasattr(creator, "Individual") or getattr(creator.Individual, "fitness") != fitness_attr:
             if hasattr(creator, "Individual"):
                 del creator.Individual
-            creator.create("Individual",
+            creator.create("Individual", list, fitness=fitness_attr)
 
         self.toolbox = base.Toolbox()
         self.toolbox.register("default_individual", lambda: creator.Individual("placeholder"))
@@ -206,7 +241,7 @@ Return ONLY this descriptive string, with no preamble or extra formatting.
             return 0.0
 
         # Calculate average Levenshtein distance between all pairs
-        total_distance = 0
+        total_distance = 0.0
         count = 0
         for i in range(len(self._current_population)):
             for j in range(i + 1, len(self._current_population)):
@@ -221,18 +256,10 @@ Return ONLY this descriptive string, with no preamble or extra formatting.
 
         return total_distance / count if count > 0 else 0.0
 
-    def _deap_crossover(
-        self,
-        ind1: "creator.Individual",
-        ind2: "creator.Individual"
-    ) -> Tuple["creator.Individual", "creator.Individual"]:
-        """Enhanced crossover operation that preserves semantic meaning.
-        Attempts chunk-level crossover first, then falls back to word-level.
-        """
-        str1_orig, str2_orig = str(ind1), str(ind2)
 
-
-
+    def _deap_crossover_chunking_strategy(self, messages_1_str: str, messages_2_str: str) -> Tuple[str, str]:
+        chunks1 = [chunk.strip() for chunk in messages_1_str.split('.') if chunk.strip()]
+        chunks2 = [chunk.strip() for chunk in messages_2_str.split('.') if chunk.strip()]
 
         # Try chunk-level crossover if both parents have at least 2 chunks
         if len(chunks1) >= 2 and len(chunks2) >= 2:
@@ -247,35 +274,73 @@ Return ONLY this descriptive string, with no preamble or extra formatting.
             child1_str = '. '.join(child1_chunks) + ('.' if child1_chunks else '')
             child2_str = '. '.join(child2_chunks) + ('.' if child2_chunks else '')
 
-            return
-
-
-
-
+            return child1_str, child2_str
+        else:
+            raise ValueError("Not enough chunks in either prompt for chunk-level crossover")
+
+    def _deap_crossover_word_level(self, messages_1_str: str, messages_2_str: str) -> Tuple[str, str]:
+        words1 = messages_1_str.split()
+        words2 = messages_2_str.split()
 
         # If either prompt is empty (no words), return parents
         if not words1 or not words2:
-            return
+            return messages_1_str, messages_2_str
 
         min_word_len = min(len(words1), len(words2))
         # Need at least 2 words in the shorter prompt for a valid crossover point
         if min_word_len < 2:
-            return
+            return messages_1_str, messages_2_str
 
         # Crossover point for words: 1 to min_word_len - 1
         point = random.randint(1, min_word_len - 1)
         child1_words = words1[:point] + words2[point:]
         child2_words = words2[:point] + words1[point:]
 
-        return
+        return ' '.join(child1_words), ' '.join(child2_words)
+
+    def _deap_crossover(
+        self,
+        ind1: "creator.Individual",
+        ind2: "creator.Individual"
+    ) -> Tuple["creator.Individual", "creator.Individual"]:
+        """Enhanced crossover operation that preserves semantic meaning.
+        Attempts chunk-level crossover first, then falls back to word-level.
+        """
+        reporting.display_message("  Recombining prompts by mixing and matching words and sentences.", verbose=self.verbose)
+        messages_1_orig: List[Dict[Literal["role", "content"], str]] = ind1
+        messages_2_orig: List[Dict[Literal["role", "content"], str]] = ind2
+
+        for i, message_1 in enumerate(messages_1_orig):
+            role: str = message_1['role']
+            message_1_str: str = message_1['content']
+
+            # We check that the second message has enough AI messages and the correct role
+            if (len(messages_2_orig) >= i + 1) and (messages_2_orig[i]['role'] == role):
+                message_2 = messages_2_orig[i]
+                message_2_str: str = message_2['content']
+
+                try:
+                    child1_str, child2_str = self._deap_crossover_chunking_strategy(message_1_str, message_2_str)
+                except ValueError:
+                    child1_str, child2_str = self._deap_crossover_word_level(message_1_str, message_2_str)
+
+                # Update the message content
+                messages_1_orig[i]['content'] = child1_str
+                messages_2_orig[i]['content'] = child2_str
+            else:
+                # We don't perform any crossover if there are not enough messages or the roles
+                # don't match
+                pass
+
+        return creator.Individual(messages_1_orig), creator.Individual(messages_2_orig)
 
     def _deap_mutation(
         self,
         individual: "creator.Individual",
-
-    ) ->
-        """Enhanced mutation operation with multiple strategies.
-        prompt =
+        initial_prompt: chat_prompt.ChatPrompt
+    ) -> "creator.Individual":
+        """Enhanced mutation operation with multiple strategies."""
+        prompt = chat_prompt.ChatPrompt(messages=individual)
 
         # Choose mutation strategy based on current diversity
         diversity = self._calculate_population_diversity()
@@ -294,23 +359,29 @@ Return ONLY this descriptive string, with no preamble or extra formatting.
 
         if mutation_choice > structural_threshold:
             # This corresponds to the original 'else' (word_level_mutation)
-
+            mutated_prompt = self._word_level_mutation_prompt(prompt)
+            reporting.display_success("  Mutation successful, prompt has been edited by randomizing words (word-level mutation).", verbose=self.verbose)
+            return creator.Individual(mutated_prompt.formatted_messages)
         elif mutation_choice > semantic_threshold:
             # This corresponds to the original 'elif' (structural_mutation)
-
+            mutated_prompt = self._structural_mutation(prompt)
+            reporting.display_success("  Mutation successful, prompt has been edited by reordering, combining, or splitting sentences (structural mutation).", verbose=self.verbose)
+            return creator.Individual(mutated_prompt.formatted_messages)
         else:
             # This corresponds to the original 'if' (semantic_mutation)
-
+            mutated_prompt = self._semantic_mutation(prompt, initial_prompt)
+            reporting.display_success("  Mutation successful, prompt has been edited using an LLM (semantic mutation).", verbose=self.verbose)
+            return creator.Individual(mutated_prompt.formatted_messages)
 
     def _semantic_mutation(
         self,
-        prompt:
-
-    ) ->
+        prompt: chat_prompt.ChatPrompt,
+        initial_prompt: chat_prompt.ChatPrompt
+    ) -> chat_prompt.ChatPrompt:
         """Enhanced semantic mutation with multiple strategies."""
         current_output_style_guidance = self.output_style_guidance
         if random.random() < 0.1:
-            return self._radical_innovation_mutation(prompt,
+            return self._radical_innovation_mutation(prompt, initial_prompt)
 
         try:
             strategy = random.choice([
@@ -327,58 +398,79 @@ Return ONLY this descriptive string, with no preamble or extra formatting.
             }
 
             user_prompt_for_semantic_mutation = f"""Given this prompt: '{prompt}'
-Task context: {self._get_task_description_for_llm(
+Task context: {self._get_task_description_for_llm(initial_prompt)}
 Desired output style from target LLM: '{current_output_style_guidance}'
 Instruction for this modification: {strategy_prompts[strategy]}.
-Return only the modified prompt
+Return only the modified prompt message list, nothing else. Make sure to return a valid JSON object.
 """
             response = self._call_model(
-
-
+                messages=[
+                    {"role": "system", "content": f"You are a prompt engineering expert. Your goal is to modify prompts to improve their effectiveness in eliciting specific types of answers, particularly matching the style: '{current_output_style_guidance}'. Follow the specific modification instruction provided."},
+                    {"role": "user", "content": user_prompt_for_semantic_mutation}
+                ],
                 is_reasoning=True
             )
-
+
+            return chat_prompt.ChatPrompt(messages=utils.json_to_dict(response.strip()))
         except Exception as e:
-
-            return
+            reporting.display_error(f"  Error in semantic mutation, this is usually a parsing error: {e}", verbose=self.verbose)
+            return prompt
 
     def _structural_mutation(
         self,
-        prompt:
-    ) ->
+        prompt: chat_prompt.ChatPrompt
+    ) -> chat_prompt.ChatPrompt:
         """Perform structural mutation (reordering, combining, splitting)."""
-
-        if len(sentences) <= 1:
-            return self._word_level_mutation(prompt)
-
-        mutation_type = random.random()
-        if mutation_type < 0.3:
-            # Reorder sentences
-            random.shuffle(sentences)
-            return creator.Individual('. '.join(sentences) + '.'),
-        elif mutation_type < 0.6:
-            # Combine adjacent sentences
-            if len(sentences) >= 2:
-                idx = random.randint(0, len(sentences) - 2)
-                combined = sentences[idx] + ' and ' + sentences[idx + 1]
-                sentences[idx:idx+2] = [combined]
-                return creator.Individual('. '.join(sentences) + '.'),
-        else:
-            # Split a sentence
-            idx = random.randint(0, len(sentences) - 1)
-            words = sentences[idx].split()
-            if len(words) > 3:
-                split_point = random.randint(2, len(words) - 2)
-                sentences[idx:idx+1] = [' '.join(words[:split_point]), ' '.join(words[split_point:])]
-                return creator.Individual('. '.join(sentences) + '.'),
-
-        return creator.Individual(prompt),
+        mutated_messages: List[Dict[Literal["role", "content"], str]] = []
 
-
+        for message in prompt.formatted_messages:
+            content = message["content"]
+            role = message["role"]
+
+            sentences = [s.strip() for s in content.split('.') if s.strip()]
+            if len(sentences) <= 1:
+                mutated_messages.append({"role": role, "content": self._word_level_mutation(content)})
+                continue
+
+            mutation_type = random.random()
+            if mutation_type < 0.3:
+                # Reorder sentences
+                random.shuffle(sentences)
+                mutated_messages.append({"role": role, "content": '. '.join(sentences) + '.'})
+                continue
+            elif mutation_type < 0.6:
+                # Combine adjacent sentences
+                if len(sentences) >= 2:
+                    idx = random.randint(0, len(sentences) - 2)
+                    combined = sentences[idx] + ' and ' + sentences[idx + 1]
+                    sentences[idx:idx+2] = [combined]
+                    mutated_messages.append({"role": role, "content": '. '.join(sentences) + '.'})
+                    continue
+            else:
+                # Split a sentence
+                idx = random.randint(0, len(sentences) - 1)
+                words = sentences[idx].split()
+                if len(words) > 3:
+                    split_point = random.randint(2, len(words) - 2)
+                    sentences[idx:idx+1] = [' '.join(words[:split_point]), ' '.join(words[split_point:])]
+                    mutated_messages.append({"role": role, "content": '. '.join(sentences) + '.'})
+                    continue
+                else:
+                    mutated_messages.append({"role": role, "content": content})
+
+        return chat_prompt.ChatPrompt(messages=mutated_messages)
+
+    def _word_level_mutation_prompt(self, prompt: chat_prompt.ChatPrompt) -> chat_prompt.ChatPrompt:
+        mutated_messages: List[Dict[Literal['role', 'content'], str]] = []
+        for message in prompt.formatted_messages:
+            mutated_messages.append({"role": message["role"], "content": self._word_level_mutation(message["content"])})
+        return chat_prompt.ChatPrompt(messages=mutated_messages)
+
+    def _word_level_mutation(self, msg_content: str) -> str:
         """Perform word-level mutation."""
-        words =
+        words = msg_content.split()
         if len(words) <= 1:
-            return
+            return msg_content
 
         mutation_type = random.random()
         if mutation_type < 0.3:
@@ -395,7 +487,7 @@ Return only the modified prompt string, nothing else.
             idx = random.randint(0, len(words) - 1)
             words[idx] = self._modify_phrase(words[idx])
 
-        return
+        return ' '.join(words)
 
     def _get_synonym(
         self,
@@ -404,8 +496,10 @@ Return only the modified prompt string, nothing else.
         """Get a synonym for a word using LLM."""
         try:
             response = self._call_model(
-
-
+                messages=[
+                    {"role": "system", "content": "You are a helpful assistant that provides synonyms. Return only the synonym word, no explanation or additional text."},
+                    {"role": "user", "content": f"Give me a single synonym for the word '{word}'. Return only the synonym, nothing else."}
+                ],
                 is_reasoning=True
             )
             return response.strip()
@@ -420,8 +514,10 @@ Return only the modified prompt string, nothing else.
         """Modify a phrase while preserving meaning using LLM."""
        try:
             response = self._call_model(
-
-
+                messages=[
+                    {"role": "system", "content": "You are a helpful assistant that rephrases text. Return only the modified phrase, no explanation or additional text."},
+                    {"role": "user", "content": f"Modify this phrase while keeping the same meaning: '{phrase}'. Return only the modified phrase, nothing else."}
+                ],
                 is_reasoning=True
             )
             return response.strip()
@@ -431,12 +527,12 @@ Return only the modified prompt string, nothing else.
 
     def _radical_innovation_mutation(
         self,
-
-
-    ) ->
+        prompt: chat_prompt.ChatPrompt,
+        initial_prompt: chat_prompt.ChatPrompt
+    ) -> chat_prompt.ChatPrompt:
         """Attempts to generate a significantly improved and potentially very different prompt using an LLM."""
-        logger.debug(f"Attempting radical innovation for prompt: {
-        task_desc_for_llm = self._get_task_description_for_llm(
+        logger.debug(f"Attempting radical innovation for prompt: {json.dumps(prompt.formatted_messages)[:70]}...")
+        task_desc_for_llm = self._get_task_description_for_llm(initial_prompt)
         current_output_style_guidance = self.output_style_guidance
 
         user_prompt_for_radical_innovation = f"""Task Context:
|
@@ -444,225 +540,307 @@ Return only the modified prompt string, nothing else.
|
|
444
540
|
Desired output style from target LLM: '{current_output_style_guidance}'
|
445
541
|
|
446
542
|
Existing Prompt (which may be underperforming):
|
447
|
-
'''{
|
543
|
+
'''{prompt.formatted_messages}'''
|
448
544
|
|
449
545
|
Please generate a new, significantly improved, and potentially very different prompt for this task.
|
450
546
|
Focus on alternative approaches, better clarity, or more effective guidance for the language model, aiming for the desired output style.
|
451
|
-
Return only the new prompt
|
547
|
+
Return only the new prompt list object.
|
452
548
|
"""
|
453
549
|
try:
|
454
550
|
new_prompt_str = self._call_model(
|
455
|
-
|
456
|
-
|
551
|
+
messages=[
|
552
|
+
{"role": "system", "content": self._get_radical_innovation_system_prompt()},
|
553
|
+
{"role": "user", "content": user_prompt_for_radical_innovation}
|
554
|
+
],
|
457
555
|
is_reasoning=True
|
458
556
|
)
|
459
|
-
logger.info(f"Radical innovation generated: {new_prompt_str[:70]}... from: {
|
460
|
-
return
|
557
|
+
logger.info(f"Radical innovation generated: {new_prompt_str[:70]}... from: {json.dumps(prompt.formatted_messages)[:70]}...")
|
558
|
+
return chat_prompt.ChatPrompt(messages=json.loads(new_prompt_str))
|
461
559
|
except Exception as e:
|
462
|
-
logger.warning(f"Radical innovation mutation failed for prompt '{
|
463
|
-
return
|
560
|
+
logger.warning(f"Radical innovation mutation failed for prompt '{json.dumps(prompt.formatted_messages)[:50]}...': {e}. Returning original.")
|
561
|
+
return prompt
|
464
562
|
|
465
563
|
def _initialize_population(
|
466
564
|
self,
|
467
|
-
|
468
|
-
|
469
|
-
) -> List[str]:
|
565
|
+
prompt: chat_prompt.ChatPrompt
|
566
|
+
) -> List[chat_prompt.ChatPrompt]:
|
470
567
|
"""Initialize the population with diverse variations of the initial prompt,
|
471
568
|
including some 'fresh start' prompts based purely on task description.
|
472
569
|
All generated prompts should aim to elicit answers matching self.output_style_guidance.
|
473
570
|
"""
|
474
|
-
|
475
|
-
|
476
|
-
|
571
|
+
with reporting.initializing_population(verbose=self.verbose) as init_pop_report:
|
572
|
+
init_pop_report.start(self.population_size)
|
573
|
+
|
574
|
+
population = [prompt]
|
575
|
+
if self.population_size <= 1:
|
576
|
+
return population
|
577
|
+
|
578
|
+
num_to_generate_total = self.population_size - 1
|
579
|
+
num_fresh_starts = max(1, int(num_to_generate_total * 0.2))
|
580
|
+
num_variations_on_initial = num_to_generate_total - num_fresh_starts
|
581
|
+
|
582
|
+
task_desc_for_llm = self._get_task_description_for_llm(prompt)
|
583
|
+
current_output_style_guidance = self.output_style_guidance
|
584
|
+
|
585
|
+
# Generate "fresh start" prompts if the initial prompt is not performing well
|
586
|
+
# Cold start prompts are generated from the task description
|
587
|
+
if num_fresh_starts > 0:
|
588
|
+
init_pop_report.start_fresh_prompts(num_fresh_starts)
|
589
|
+
fresh_start_user_prompt = f"""Here is a description of a task:
|
590
|
+
{task_desc_for_llm}
|
591
|
+
|
592
|
+
The goal is to generate prompts that will make a target LLM produce responses in the following style: '{current_output_style_guidance}'.
|
593
|
+
|
594
|
+
Please generate {num_fresh_starts} diverse and effective prompt(s) for a language model to accomplish this task, ensuring they guide towards this specific output style.
|
595
|
+
Focus on clarity, completeness, and guiding the model effectively towards the desired style. Explore different structural approaches.
|
596
|
+
|
597
|
+
Example of valid response: [
|
598
|
+
["role": "<role>", "content": "<Prompt targeting specified style.>"],
|
599
|
+
["role": "<role>", "content": "<Another prompt designed for the output style.>"]
|
600
|
+
]
|
477
601
|
|
478
|
-
|
479
|
-
|
480
|
-
|
602
|
+
Your response MUST be a valid JSON list of AI messages. Do NOT include any other text, explanations, or Markdown formatting like ```json ... ``` around the list.
|
603
|
+
|
604
|
+
"""
|
605
|
+
try:
|
606
|
+
response_content = self._call_model(
|
607
|
+
messages=[
|
608
|
+
{"role": "system", "content": f"You are an expert prompt engineer. Your task is to generate novel, effective prompts from scratch based on a task description, specifically aiming for prompts that elicit answers in the style: '{current_output_style_guidance}'. Output ONLY a raw JSON list of strings."},
|
609
|
+
{"role": "user", "content": fresh_start_user_prompt}
|
610
|
+
],
|
611
|
+
is_reasoning=True
|
612
|
+
)
|
613
|
+
|
614
|
+
logger.debug(f"Raw LLM response for fresh start prompts: {response_content}")
|
615
|
+
|
616
|
+
fresh_prompts = utils.json_to_dict(response_content)
|
617
|
+
if isinstance(fresh_prompts, list):
|
618
|
+
if all(isinstance(p, dict) for p in fresh_prompts) and all(p.get("role") is not None for p in fresh_prompts):
|
619
|
+
population.append(chat_prompt.ChatPrompt(messages=fresh_prompts))
|
620
|
+
init_pop_report.success_fresh_prompts(1)
|
621
|
+
elif all(isinstance(p, list) for p in fresh_prompts):
|
622
|
+
population.extend([chat_prompt.ChatPrompt(messages=p) for p in fresh_prompts[:num_fresh_starts]])
|
623
|
+
init_pop_report.success_fresh_prompts(len(fresh_prompts[:num_fresh_starts]))
|
624
|
+
else:
|
625
|
+
init_pop_report.failed_fresh_prompts(
|
626
|
+
num_fresh_starts,
|
627
|
+
f"LLM response for fresh starts was not a valid list of strings or was empty: {response_content}. Skipping fresh start prompts."
|
628
|
+
)
|
629
|
+
except json.JSONDecodeError as e_json:
|
630
|
+
init_pop_report.failed_fresh_prompts(
|
631
|
+
num_fresh_starts,
|
632
|
+
f"JSONDecodeError generating fresh start prompts: {e_json}. LLM response: '{response_content}'. Skipping fresh start prompts."
|
633
|
+
)
|
634
|
+
except Exception as e:
|
635
|
+
init_pop_report.failed_fresh_prompts(
|
636
|
+
num_fresh_starts,
|
637
|
+
f"Error generating fresh start prompts: {e}. Skipping fresh start prompts."
|
638
|
+
)
|
639
|
+
|
640
|
+
# Generate variations on the initial prompt for the remaining slots
|
641
|
+
# TODO: Could add variations with hyper-parameters from the task config like temperature, etc.
|
642
|
+
if num_variations_on_initial > 0:
|
643
|
+
init_pop_report.start_variations(num_variations_on_initial)
|
644
|
+
|
645
|
+
# TODO: We need to split this into batches as the model will not return enough tokens
|
646
|
+
# to generate all the candidates
|
647
|
+
user_prompt_for_variation = f"""Initial prompt:
|
648
|
+
'''{prompt.formatted_messages}'''
|
649
|
+
|
650
|
+
Task context:
|
651
|
+
{task_desc_for_llm}
|
652
|
+
Desired output style from target LLM: '{current_output_style_guidance}'
|
653
|
+
|
654
|
+
Generate {num_variations_on_initial} diverse alternative prompts based on the initial prompt above, keeping the task context and desired output style in mind.
|
655
|
+
All generated prompt variations should strongly aim to elicit answers from the target LLM matching the style: '{current_output_style_guidance}'.
|
656
|
+
For each variation, consider how to best achieve this style, e.g., by adjusting specificity, structure, phrasing, constraints, or by explicitly requesting it.
|
657
|
+
|
658
|
+
Return a JSON array of prompts with the following structure:
|
659
|
+
{{
|
660
|
+
"prompts": [
|
661
|
+
{{
|
662
|
+
"prompt": [{{"role": "<role>", "content": "<content>"}}],
|
663
|
+
"strategy": "brief description of the variation strategy used, e.g., 'direct instruction for target style'"
|
664
|
+
}}
|
665
|
+
// ... more prompts if num_variations_on_initial > 1
|
666
|
+
]
|
667
|
+
}}
|
668
|
+
Ensure a good mix of variations, all targeting the specified output style from the end LLM.
|
669
|
+
|
670
|
+
Return a valid JSON object that is correctly escaped. Return nothing else, d`o not include any additional text or Markdown formatting.
|
671
|
+
"""
|
672
|
+
try:
|
673
|
+
response_content_variations = self._call_model(
|
674
|
+
messages=[
|
675
|
+
{"role": "system", "content": self._get_reasoning_system_prompt_for_variation()},
|
676
|
+
{"role": "user", "content": user_prompt_for_variation}
|
677
|
+
],
|
678
|
+
is_reasoning=True
|
679
|
+
)
|
680
|
+
logger.debug(f"Raw response for population variations: {response_content_variations}")
|
681
|
+
json_response_variations = json.loads(response_content_variations)
|
682
|
+
generated_prompts_variations = [p["prompt"] for p in json_response_variations.get("prompts", []) if isinstance(p, dict) and "prompt" in p]
|
683
|
+
|
684
|
+
if generated_prompts_variations:
|
685
|
+
init_pop_report.success_variations(len(generated_prompts_variations[:num_variations_on_initial]))
|
686
|
+
population.extend([chat_prompt.ChatPrompt(messages=p) for p in generated_prompts_variations[:num_variations_on_initial]])
|
687
|
+
else:
|
688
|
+
init_pop_report.failed_variations(num_variations_on_initial, "Could not parse 'prompts' list for variations. Skipping variations.")
|
689
|
+
except Exception as e:
|
690
|
+
init_pop_report.failed_variations(num_variations_on_initial, f"Error calling LLM for initial population variations: {e}")
|
691
|
+
|
692
|
+
# Ensure population is of the required size using unique prompts
|
693
|
+
# TODO Test with levenshtein distance
|
694
|
+
final_population_set: Set[str] = set()
|
695
|
+
final_population_list: List[chat_prompt.ChatPrompt] = []
|
696
|
+
for p in population:
|
697
|
+
if json.dumps(p.formatted_messages) not in final_population_set:
|
698
|
+
final_population_set.add(json.dumps(p.formatted_messages))
|
699
|
+
final_population_list.append(p)
|
700
|
+
|
701
|
+
init_pop_report.end(final_population_list)
|
702
|
+
# Return exactly population_size prompts if possible, or fewer if generation failed badly.
|
703
|
+
return final_population_list[:self.population_size]
|
481
704
|
|
482
|
-
task_desc_for_llm = self._get_task_description_for_llm(task_config)
|
483
|
-
current_output_style_guidance = self.output_style_guidance
|
484
705
|
|
485
|
-
|
486
|
-
|
487
|
-
if
|
488
|
-
|
489
|
-
|
490
|
-
|
706
|
+
def _should_restart_population(self, curr_best: float) -> bool:
|
707
|
+
"""
|
708
|
+
Update internal counters and decide if we should trigger
|
709
|
+
a population restart based on lack of improvement.
|
710
|
+
"""
|
711
|
+
if self._best_primary_score_history:
|
712
|
+
threshold = self._best_primary_score_history[-1] * (1 + self.DEFAULT_RESTART_THRESHOLD)
|
713
|
+
if curr_best < threshold:
|
714
|
+
self._gens_since_pop_improvement += 1
|
715
|
+
else:
|
716
|
+
self._gens_since_pop_improvement = 0
|
717
|
+
self._best_primary_score_history.append(curr_best)
|
718
|
+
return self._gens_since_pop_improvement >= self.DEFAULT_RESTART_GENERATIONS
|
491
719
|
|
492
|
-
|
720
|
+
def _restart_population(
|
721
|
+
self,
|
722
|
+
hof: tools.HallOfFame,
|
723
|
+
population: list["creator.Individual"],
|
724
|
+
best_prompt_so_far: chat_prompt.ChatPrompt,
|
725
|
+
) -> list["creator.Individual"]:
|
726
|
+
"""Return a fresh, evaluated population seeded by elites."""
|
727
|
+
if self.enable_moo:
|
728
|
+
elites = list(hof)
|
729
|
+
else:
|
730
|
+
elites = tools.selBest(population, self.elitism_size)
|
493
731
|
|
494
|
-
|
495
|
-
|
496
|
-
|
497
|
-
|
498
|
-
"""
|
499
|
-
try:
|
500
|
-
response_content = self._call_model(
|
501
|
-
prompt=fresh_start_user_prompt,
|
502
|
-
system_prompt=f"You are an expert prompt engineer. Your task is to generate novel, effective prompts from scratch based on a task description, specifically aiming for prompts that elicit answers in the style: '{current_output_style_guidance}'. Output ONLY a raw JSON list of strings.",
|
503
|
-
is_reasoning=True
|
504
|
-
)
|
505
|
-
logger.debug(f"Raw LLM response for fresh start prompts: {response_content}")
|
506
|
-
|
507
|
-
cleaned_response_content = response_content.strip()
|
508
|
-
if cleaned_response_content.startswith("```json"):
|
509
|
-
cleaned_response_content = cleaned_response_content[7:]
|
510
|
-
if cleaned_response_content.endswith("```"):
|
511
|
-
cleaned_response_content = cleaned_response_content[:-3]
|
512
|
-
elif cleaned_response_content.startswith("```"):
|
513
|
-
cleaned_response_content = cleaned_response_content[3:]
|
514
|
-
if cleaned_response_content.endswith("```"):
|
515
|
-
cleaned_response_content = cleaned_response_content[:-3]
|
516
|
-
cleaned_response_content = cleaned_response_content.strip()
|
517
|
-
|
518
|
-
fresh_prompts = json.loads(cleaned_response_content)
|
519
|
-
if isinstance(fresh_prompts, list) and all(isinstance(p, str) for p in fresh_prompts) and fresh_prompts:
|
520
|
-
population.extend(fresh_prompts[:num_fresh_starts])
|
521
|
-
logger.info(f"Generated {len(fresh_prompts[:num_fresh_starts])} fresh prompts from LLM.")
|
522
|
-
else:
|
523
|
-
logger.warning(f"LLM response for fresh starts was not a valid list of strings or was empty: {cleaned_response_content}. Using fallbacks for fresh starts.")
|
524
|
-
population.extend(self._generate_fallback_variations(f"Fresh start targeting style: {current_output_style_guidance[:20]}", num_fresh_starts))
|
525
|
-
except json.JSONDecodeError as e_json:
|
526
|
-
logger.warning(f"JSONDecodeError generating fresh start prompts: {e_json}. LLM response (after cleaning): '{cleaned_response_content if 'cleaned_response_content' in locals() else response_content}'. Using fallbacks for fresh starts.")
|
527
|
-
population.extend(self._generate_fallback_variations(f"Fresh start targeting style: {current_output_style_guidance[:20]}", num_fresh_starts))
|
528
|
-
except Exception as e:
|
529
|
-
logger.warning(f"Error generating fresh start prompts: {e}. Using fallbacks for fresh starts.")
|
530
|
-
population.extend(self._generate_fallback_variations(f"Fresh start targeting style: {current_output_style_guidance[:20]}", num_fresh_starts))
|
732
|
+
seed_prompt = (
|
733
|
+
chat_prompt.ChatPrompt(messages=max(elites, key=lambda x: x.fitness.values[0]))
|
734
|
+
if elites else best_prompt_so_far
|
735
|
+
)
|
531
736
|
|
532
|
-
|
533
|
-
|
534
|
-
if num_variations_on_initial > 0:
|
535
|
-
logger.info(f"Generating {num_variations_on_initial} variations of the initial prompt (aiming for style: '{current_output_style_guidance[:30]}...')...")
|
536
|
-
user_prompt_for_variation = f"""Initial prompt:
|
537
|
-
'''{initial_prompt}'''
|
737
|
+
prompt_variants = self._initialize_population(seed_prompt)
|
738
|
+
new_pop = [creator.Individual(p.formatted_messages) for p in prompt_variants]
|
538
739
|
|
539
|
-
|
540
|
-
|
541
|
-
Desired output style from target LLM: '{current_output_style_guidance}'
|
740
|
+
for ind, fit in zip(new_pop, map(self.toolbox.evaluate, new_pop)):
|
741
|
+
ind.fitness.values = fit
|
542
742
|
|
543
|
-
|
544
|
-
|
545
|
-
For each variation, consider how to best achieve this style, e.g., by adjusting specificity, structure, phrasing, constraints, or by explicitly requesting it.
|
743
|
+
self._gens_since_pop_improvement = 0
|
744
|
+
return new_pop
|
546
745
|
|
547
|
-
|
548
|
-
|
549
|
-
|
550
|
-
|
551
|
-
|
552
|
-
|
553
|
-
|
554
|
-
|
555
|
-
]
|
556
|
-
|
557
|
-
|
558
|
-
|
559
|
-
|
560
|
-
|
561
|
-
|
562
|
-
|
563
|
-
|
564
|
-
|
565
|
-
|
566
|
-
|
567
|
-
|
568
|
-
|
569
|
-
|
570
|
-
|
571
|
-
|
572
|
-
|
573
|
-
|
574
|
-
|
575
|
-
|
576
|
-
|
577
|
-
|
578
|
-
|
579
|
-
#
|
580
|
-
|
581
|
-
|
582
|
-
|
583
|
-
|
584
|
-
|
585
|
-
|
586
|
-
|
587
|
-
|
588
|
-
|
589
|
-
|
590
|
-
if fallback_prompt not in final_population_set:
|
591
|
-
final_population_list.append(fallback_prompt)
|
592
|
-
final_population_set.add(fallback_prompt)
|
593
|
-
else:
|
594
|
-
# Safeguard if initial_prompt itself is causing issues with uniqueness
|
595
|
-
fallback_prompt = f"Fallback prompt variation {random.randint(1000,9999)}"
|
596
|
-
if fallback_prompt not in final_population_set:
|
597
|
-
final_population_list.append(fallback_prompt)
|
598
|
-
final_population_set.add(fallback_prompt)
|
599
|
-
# Avoid infinite loop in extreme edge case
|
600
|
-
else: break
|
601
|
-
|
602
|
-
logger.info(f"Initialized population with {len(final_population_list)} prompts.")
|
603
|
-
# Return exactly population_size prompts if possible, or fewer if generation failed badly.
|
604
|
-
return final_population_list[:self.population_size]
|
605
|
-
|
606
|
-
def _generate_diverse_variation(
|
607
|
-
self,
|
608
|
-
base_prompt: str,
|
609
|
-
seen_prompts: set
|
610
|
-
) -> str:
|
611
|
-
"""Generate a new variation that's different from existing ones."""
|
612
|
-
max_attempts = 5
|
613
|
-
for _ in range(max_attempts):
|
614
|
-
# Try different mutation strategies
|
615
|
-
mutation_choice = random.random()
|
616
|
-
if mutation_choice < 0.3:
|
617
|
-
new_prompt = self._semantic_mutation(base_prompt)[0]
|
618
|
-
elif mutation_choice < 0.6:
|
619
|
-
new_prompt = self._structural_mutation(base_prompt)[0]
|
620
|
-
else:
|
621
|
-
new_prompt = self._word_level_mutation(base_prompt)[0]
|
622
|
-
|
623
|
-
# Check if this variation is sufficiently different
|
624
|
-
is_diverse = True
|
625
|
-
for existing in seen_prompts:
|
626
|
-
if Levenshtein.distance(str(new_prompt), existing) / max(len(str(new_prompt)), len(existing)) < 0.3:
|
627
|
-
is_diverse = False
|
628
|
-
break
|
629
|
-
if is_diverse:
|
630
|
-
return str(new_prompt)
|
746
|
+
def _run_generation(
|
747
|
+
self,
|
748
|
+
generation_idx: int,
|
749
|
+
population: list["creator.Individual"],
|
750
|
+
prompt: chat_prompt.ChatPrompt,
|
751
|
+
hof: tools.HallOfFame,
|
752
|
+
report: Any,
|
753
|
+
best_primary_score_overall: float,
|
754
|
+
) -> tuple[list["creator.Individual"], int]:
|
755
|
+
"""Execute mating, mutation, evaluation and HoF update."""
|
756
|
+
best_gen_score = 0.0
|
757
|
+
|
758
|
+
# --- selection -------------------------------------------------
|
759
|
+
if self.enable_moo:
|
760
|
+
offspring = self.toolbox.select(population, self.population_size)
|
761
|
+
else:
|
762
|
+
elites = tools.selBest(population, self.elitism_size)
|
763
|
+
rest = self.toolbox.select(population, len(population) - self.elitism_size)
|
764
|
+
offspring = elites + rest
|
765
|
+
|
766
|
+
# --- crossover -------------------------------------------------
|
767
|
+
report.performing_crossover()
|
768
|
+
offspring = list(map(self.toolbox.clone, offspring))
|
769
|
+
for i in range(0, len(offspring), 2):
|
770
|
+
if i+1 < len(offspring):
|
771
|
+
c1, c2 = offspring[i], offspring[i+1]
|
772
|
+
if random.random() < self.crossover_rate:
|
773
|
+
c1_new, c2_new = self.toolbox.mate(c1, c2)
|
774
|
+
offspring[i], offspring[i+1] = c1_new, c2_new
|
775
|
+
del offspring[i].fitness.values, offspring[i+1].fitness.values
|
776
|
+
reporting.display_success(" Crossover successful, prompts have been combined and edited.\n│", verbose=self.verbose)
|
777
|
+
|
778
|
+
# --- mutation --------------------------------------------------
|
779
|
+
report.performing_mutation()
|
780
|
+
mut_rate = self._get_adaptive_mutation_rate()
|
781
|
+
n_mutations = 0
|
782
|
+
for i, ind in enumerate(offspring):
|
783
|
+
if random.random() < mut_rate:
|
784
|
+
new_ind = self.toolbox.mutate(ind, initial_prompt=prompt)
|
785
|
+
offspring[i] = new_ind
|
786
|
+
del offspring[i].fitness.values
|
787
|
+
n_mutations += 1
|
788
|
+
reporting.display_success(f" Mutation successful, {n_mutations} prompts have been edited.\n│", verbose=self.verbose)
|
631
789
|
|
632
|
-
#
|
633
|
-
|
790
|
+
# --- evaluation ------------------------------------------------
|
791
|
+
invalid = [ind for ind in offspring if not ind.fitness.valid]
|
792
|
+
report.performing_evaluation(len(invalid))
|
793
|
+
for ind_idx, ind in enumerate(invalid):
|
794
|
+
fit = self.toolbox.evaluate(ind)
|
795
|
+
ind.fitness.values = fit
|
796
|
+
best_gen_score = max(best_gen_score, fit[0])
|
634
797
|
|
635
|
-
|
636
|
-
self,
|
637
|
-
initial_prompt: str,
|
638
|
-
num_variations: int
|
639
|
-
) -> List[str]:
|
640
|
-
"""Generate fallback variations when LLM generation fails."""
|
641
|
-
variations = []
|
642
|
-
words = initial_prompt.split()
|
798
|
+
report.performed_evaluation(ind_idx, ind.fitness.values[0])
|
643
799
|
|
644
|
-
|
645
|
-
|
646
|
-
|
647
|
-
shuffled = words.copy()
|
648
|
-
random.shuffle(shuffled)
|
649
|
-
variations.append(' '.join(shuffled))
|
650
|
-
else:
|
651
|
-
# Add simple variations
|
652
|
-
variations.append(initial_prompt + f" #v{i}")
|
800
|
+
# --- update HoF & reporter ------------------------------------
|
801
|
+
hof.update(offspring)
|
802
|
+
reporting.end_gen(generation_idx, best_gen_score, best_primary_score_overall, verbose=self.verbose)
|
653
803
|
|
654
|
-
return
|
804
|
+
return offspring, len(invalid)
|
805
|
+
|
806
|
+
def _population_best_score(self, population: List["creator.Individual"]) -> float:
|
807
|
+
"""Return highest primary-objective score among *valid* individuals."""
|
808
|
+
valid_scores = [ind.fitness.values[0] for ind in population if ind.fitness.valid]
|
809
|
+
return max(valid_scores, default=0.0)
|
655
810
|
|
656
811
|
def optimize_prompt(
|
657
812
|
self,
|
658
|
-
|
659
|
-
|
660
|
-
|
813
|
+
prompt: chat_prompt.ChatPrompt,
|
814
|
+
dataset: opik.Dataset,
|
815
|
+
metric: Callable,
|
661
816
|
experiment_config: Optional[Dict] = None,
|
662
817
|
n_samples: Optional[int] = None,
|
663
818
|
auto_continue: bool = False,
|
664
819
|
**kwargs,
|
665
820
|
) -> OptimizationResult:
|
821
|
+
"""
|
822
|
+
Args:
|
823
|
+
prompt: The prompt to optimize
|
824
|
+
dataset: The dataset to use for evaluation
|
825
|
+
metric: Metric function to optimize with, should have the arguments `dataset_item` and `llm_output`
|
826
|
+
experiment_config: Optional experiment configuration
|
827
|
+
n_samples: Optional number of samples to use
|
828
|
+
auto_continue: Whether to automatically continue optimization
|
829
|
+
**kwargs: Additional keyword arguments
|
830
|
+
"""
|
831
|
+
reporting.display_header(self.__class__.__name__, verbose=self.verbose)
|
832
|
+
reporting.display_configuration(
|
833
|
+
prompt.formatted_messages,
|
834
|
+
{
|
835
|
+
"optimizer": f"{ 'DEAP MOO' if self.enable_moo else 'DEAP SO' } Evolutionary Optimization",
|
836
|
+
"population_size": self.population_size,
|
837
|
+
"generations": self.num_generations,
|
838
|
+
"mutation_rate": self.mutation_rate,
|
839
|
+
"crossover_rate": self.crossover_rate,
|
840
|
+
},
|
841
|
+
verbose=self.verbose
|
842
|
+
)
|
843
|
+
|
666
844
|
self.llm_call_counter = 0
|
667
845
|
self._history = []
|
668
846
|
self._current_optimization_id = None
|
@@ -672,145 +850,120 @@ Ensure a good mix of variations, all targeting the specified output style from t
|
|
672
850
|
self._llm_cache.clear()
|
673
851
|
self._current_population = []
|
674
852
|
self._generations_without_overall_improvement = 0
|
675
|
-
|
676
|
-
# Determine final output_style_guidance
|
677
|
-
effective_output_style_guidance = self.output_style_guidance
|
678
|
-
if self.infer_output_style and \
|
679
|
-
(self.output_style_guidance is None or self.output_style_guidance == self.DEFAULT_OUTPUT_STYLE_GUIDANCE):
|
680
|
-
# If user wants inference AND hasn't provided a specific custom guidance
|
681
|
-
inferred_style = self._infer_output_style_from_dataset(dataset, task_config)
|
682
|
-
if inferred_style:
|
683
|
-
effective_output_style_guidance = inferred_style
|
684
|
-
# Update self.output_style_guidance for this run so dynamic prompt methods use it
|
685
|
-
self.output_style_guidance = inferred_style
|
686
|
-
else:
|
687
|
-
logger.warning("Failed to infer output style, using default or user-provided guidance.")
|
688
853
|
|
689
|
-
#
|
690
|
-
# (It might have been None if user passed None and infer_output_style was False)
|
691
|
-
if self.output_style_guidance is None:
|
692
|
-
# Fallback if still None
|
693
|
-
self.output_style_guidance = self.DEFAULT_OUTPUT_STYLE_GUIDANCE
|
694
|
-
|
695
|
-
# The methods like get_reasoning_system_prompt_for_variation will now use the potentially updated self.output_style_guidance
|
696
|
-
log_prefix = "DEAP MOO" if self.enable_moo else "DEAP SO"
|
697
|
-
logger.info(f"Starting {log_prefix} Evolutionary Optimization for prompt: {task_config.instruction_prompt[:100]}...")
|
698
|
-
logger.info(f"Population: {self.population_size}, Generations: {self.num_generations}, Mutation: {self.mutation_rate}, Crossover: {self.crossover_rate}")
|
699
|
-
|
700
|
-
opik_dataset_obj: opik.Dataset
|
701
|
-
if isinstance(dataset, str):
|
702
|
-
opik_dataset_obj = self._opik_client.get_dataset(dataset)
|
703
|
-
else:
|
704
|
-
opik_dataset_obj = dataset
|
705
|
-
|
706
|
-
opik_optimization_run = None
|
707
|
-
try:
|
708
|
-
opik_optimization_run = self._opik_client.create_optimization(
|
709
|
-
dataset_name=opik_dataset_obj.name,
|
710
|
-
objective_name=metric_config.metric.name,
|
711
|
-
metadata={"optimizer": self.__class__.__name__},
|
712
|
-
)
|
713
|
-
self._current_optimization_id = opik_optimization_run.id
|
714
|
-
logger.info(f"Created Opik Optimization run with ID: {self._current_optimization_id}")
|
715
|
-
except Exception as e:
|
716
|
-
logger.warning(f"Opik server error: {e}. Continuing without Opik tracking.")
|
717
|
-
|
718
|
-
# Use of multi-objective fitness function or single-objective fitness function
|
854
|
+
# Step 0. Define fitness function
|
719
855
|
if self.enable_moo:
|
720
856
|
def _deap_evaluate_individual_fitness(
|
721
|
-
|
857
|
+
messages: List[Dict[str, str]]
|
722
858
|
) -> Tuple[float, float]:
|
723
|
-
primary_fitness_score = self.evaluate_prompt(
|
724
|
-
|
725
|
-
|
859
|
+
primary_fitness_score: float = self.evaluate_prompt(
|
860
|
+
prompt=chat_prompt.ChatPrompt(messages=messages),
|
861
|
+
dataset=dataset,
|
862
|
+
metric=metric,
|
863
|
+
n_samples=n_samples,
|
726
864
|
experiment_config=(experiment_config or {}).copy(),
|
727
|
-
optimization_id=self._current_optimization_id,
|
865
|
+
optimization_id=self._current_optimization_id,
|
866
|
+
verbose=0
|
728
867
|
)
|
729
|
-
prompt_length = float(len(str(
|
730
|
-
logger.debug(f"Evaluated MOO individual '{str(individual_prompt_str)[:50]}...' -> Primary Score: {primary_fitness_score:.4f}, Length: {prompt_length}")
|
868
|
+
prompt_length = float(len(str(json.dumps(messages))))
|
731
869
|
return (primary_fitness_score, prompt_length)
|
732
870
|
else:
|
733
871
|
# Single-objective
|
734
872
|
def _deap_evaluate_individual_fitness(
|
735
|
-
|
873
|
+
messages: List[Dict[str, str]]
|
736
874
|
) -> Tuple[float,]:
|
737
|
-
fitness_score = self.evaluate_prompt(
|
738
|
-
|
739
|
-
|
875
|
+
fitness_score: float = self.evaluate_prompt(
|
876
|
+
prompt=chat_prompt.ChatPrompt(messages=messages),
|
877
|
+
dataset=dataset,
|
878
|
+
metric=metric,
|
879
|
+
n_samples=n_samples,
|
740
880
|
experiment_config=(experiment_config or {}).copy(),
|
741
|
-
optimization_id=self._current_optimization_id,
|
881
|
+
optimization_id=self._current_optimization_id,
|
882
|
+
verbose=0
|
742
883
|
)
|
743
|
-
logger.debug(f"Evaluated SO individual '{str(individual_prompt_str)[:50]}...' -> Score: {fitness_score:.4f}")
|
744
884
|
return (fitness_score,)
|
745
|
-
|
746
|
-
# Register the fitness function with DEAP
|
747
885
|
self.toolbox.register("evaluate", _deap_evaluate_individual_fitness)
|
748
886
|
|
749
|
-
|
750
|
-
|
751
|
-
|
752
|
-
|
753
|
-
|
887
|
+
# Step 1. Start Opik optimization run
|
888
|
+
opik_optimization_run: Optional[optimization.Optimization] = None
|
889
|
+
try:
|
890
|
+
opik_optimization_run: optimization.Optimization = self._opik_client.create_optimization(
|
891
|
+
dataset_name=dataset.name,
|
892
|
+
objective_name=metric.__name__,
|
893
|
+
metadata={"optimizer": self.__class__.__name__},
|
894
|
+
)
|
895
|
+
self._current_optimization_id = opik_optimization_run.id
|
896
|
+
logger.info(f"Created Opik Optimization run with ID: {self._current_optimization_id}")
|
897
|
+
except Exception as e:
|
898
|
+
logger.warning(f"Opik server error: {e}. Continuing without Opik tracking.")
|
754
899
|
|
755
|
-
|
756
|
-
|
757
|
-
|
900
|
+
# Step 2. Compute the initial performance of the prompt
|
901
|
+
with reporting.baseline_performance(verbose=self.verbose) as report_baseline_performance:
|
902
|
+
initial_eval_result: Tuple[float, float] | Tuple[float, ] = _deap_evaluate_individual_fitness(prompt.formatted_messages)
|
903
|
+
initial_primary_score: float = initial_eval_result[0]
|
904
|
+
initial_length: float = initial_eval_result[1] if self.enable_moo else float(len(json.dumps(prompt.formatted_messages)))
|
905
|
+
|
906
|
+
best_primary_score_overall: float = initial_primary_score
|
907
|
+
best_prompt_overall = prompt
|
908
|
+
report_baseline_performance.set_score(initial_primary_score)
|
758
909
|
|
759
|
-
|
760
|
-
|
761
|
-
if self.
|
762
|
-
|
763
|
-
|
764
|
-
|
910
|
+
# Step 3. Define the output style guide
|
911
|
+
effective_output_style_guidance = self.output_style_guidance
|
912
|
+
if self.infer_output_style and \
|
913
|
+
(self.output_style_guidance is None or self.output_style_guidance == self.DEFAULT_OUTPUT_STYLE_GUIDANCE):
|
914
|
+
# If user wants inference AND hasn't provided a specific custom guidance
|
915
|
+
inferred_style = self._infer_output_style_from_dataset(dataset, prompt)
|
916
|
+
if inferred_style:
|
917
|
+
effective_output_style_guidance = inferred_style
|
918
|
+
# Update self.output_style_guidance for this run so dynamic prompt methods use it
|
919
|
+
self.output_style_guidance = inferred_style
|
920
|
+
else:
|
921
|
+
logger.warning("Failed to infer output style, using default or user-provided guidance.")
|
765
922
|
|
766
|
-
#
|
923
|
+
# Ensure self.output_style_guidance is set to the effective one for the rest of the methods for this run
|
924
|
+
# (It might have been None if user passed None and infer_output_style was False)
|
925
|
+
if self.output_style_guidance is None:
|
926
|
+
# Fallback if still None
|
927
|
+
self.output_style_guidance = self.DEFAULT_OUTPUT_STYLE_GUIDANCE
|
928
|
+
|
929
|
+
# Step 4. Initialize population
|
930
|
+
initial_prompts: List[chat_prompt.ChatPrompt] = self._initialize_population(
|
931
|
+
prompt=prompt
|
932
|
+
)
|
933
|
+
|
934
|
+
deap_population = [creator.Individual(p.formatted_messages) for p in initial_prompts]
|
935
|
+
deap_population = deap_population[:self.population_size]
|
936
|
+
|
937
|
+
# Step 5. Initialize the hall of fame (Pareto front for MOO) and stats for MOO or SO
|
767
938
|
if self.enable_moo:
|
768
939
|
hof = tools.ParetoFront()
|
769
|
-
stats_primary = tools.Statistics(lambda ind: ind.fitness.values[0])
|
770
|
-
stats_length = tools.Statistics(lambda ind: ind.fitness.values[1])
|
771
|
-
stats_primary.register("avg_score", lambda x: sum(x) / len(x) if len(x) > 0 else 0)
|
772
|
-
stats_primary.register("max_score", max)
|
773
|
-
stats_length.register("avg_len", lambda x: sum(x) / len(x) if len(x) > 0 else 0)
|
774
|
-
stats_length.register("min_len", min)
|
775
|
-
mstats = tools.MultiStatistics(score=stats_primary, length=stats_length)
|
776
|
-
logbook_header_stats = mstats.fields
|
777
940
|
else:
|
778
941
|
# Single-objective
|
779
942
|
hof = tools.HallOfFame(self.DEFAULT_HALL_OF_FAME_SIZE)
|
780
|
-
stats = tools.Statistics(lambda ind: ind.fitness.values[0])
|
781
|
-
stats.register("avg", lambda x: sum(x) / len(x) if len(x) > 0 else 0)
|
782
|
-
stats.register("std", lambda x: (sum((xi - (sum(x) / len(x) if len(x) > 0 else 0))**2 for xi in x) / len(x))**0.5 if len(x) > 1 else 0)
|
783
|
-
stats.register("min", min)
|
784
|
-
stats.register("max", max)
|
785
|
-
logbook_header_stats = stats.fields
|
786
|
-
|
787
|
-
logbook = tools.Logbook()
|
788
|
-
logbook.header = ["gen", "evals"] + logbook_header_stats
|
789
943
|
|
790
|
-
# Evaluate the initial population
|
791
|
-
|
792
|
-
|
793
|
-
|
944
|
+
# Step 6. Evaluate the initial population
|
945
|
+
with reporting.evaluate_initial_population(verbose=self.verbose) as report_initial_population:
|
946
|
+
fitnesses: List[float] = list(map(self.toolbox.evaluate, deap_population))
|
947
|
+
_best_score = max(best_primary_score_overall, max([x[0] for x in fitnesses]))
|
948
|
+
|
949
|
+
for i, ind, fit in zip(range(len(deap_population)), deap_population, fitnesses):
|
950
|
+
ind.fitness.values = fit
|
951
|
+
report_initial_population.set_score(i, fit[0], _best_score)
|
794
952
|
|
795
953
|
hof.update(deap_population)
|
796
|
-
|
797
|
-
record = record_stats.compile(deap_population) if record_stats else {}
|
798
|
-
logbook.record(gen=0, evals=len(deap_population), **record)
|
799
|
-
if self.verbose >= 1:
|
800
|
-
print(logbook.stream)
|
801
|
-
|
954
|
+
|
802
955
|
if hof and len(hof) > 0:
|
803
956
|
if self.enable_moo:
|
804
|
-
current_best_for_primary = max(hof, key=lambda ind: ind.fitness.values[0])
|
805
|
-
best_primary_score_overall = current_best_for_primary.fitness.values[0]
|
806
|
-
best_prompt_overall =
|
957
|
+
current_best_for_primary: creator.Individual = max(hof, key=lambda ind: ind.fitness.values[0])
|
958
|
+
best_primary_score_overall: float = current_best_for_primary.fitness.values[0]
|
959
|
+
best_prompt_overall = chat_prompt.ChatPrompt(messages=current_best_for_primary)
|
807
960
|
else:
|
808
961
|
# Single-objective
|
809
962
|
current_best_on_front = hof[0]
|
810
|
-
best_primary_score_overall = current_best_on_front.fitness.values[0]
|
963
|
+
best_primary_score_overall: float = current_best_on_front.fitness.values[0]
|
811
964
|
|
812
965
|
if self.enable_moo:
|
813
|
-
logger.info(f"Gen {0}: New best primary score: {best_primary_score_overall:.4f}, Prompt: {best_prompt_overall[:100]}...")
|
966
|
+
logger.info(f"Gen {0}: New best primary score: {best_primary_score_overall:.4f}, Prompt: {json.dumps(best_prompt_overall.formatted_messages)[:100]}...")
|
814
967
|
else:
|
815
968
|
logger.info(f"Gen {0}: New best score: {best_primary_score_overall:.4f}")
|
816
969
|
|
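The new code above builds DEAP individuals directly from a ChatPrompt's formatted messages and, under MOO, tracks them in a Pareto front. The following self-contained sketch shows that DEAP pattern in isolation; the fitness weights, the toy evaluate function and the sample messages are assumptions for illustration, not definitions taken from this package.

# Illustrative sketch only: message-list individuals with a (score, length) fitness.
from deap import base, creator, tools

creator.create("FitnessMulti", base.Fitness, weights=(1.0, -1.0))   # assumed weights
creator.create("Individual", list, fitness=creator.FitnessMulti)

def toy_evaluate(messages):
    # Stand-in for toolbox.evaluate: returns (primary score, prompt length).
    text = " ".join(m["content"] for m in messages)
    return (min(1.0, len(text) / 100.0), float(len(text)))

population = [
    creator.Individual([
        {"role": "system", "content": "You answer concisely."},
        {"role": "user", "content": "{question}"},
    ]),
]
for ind in population:
    ind.fitness.values = toy_evaluate(ind)

hof = tools.ParetoFront()
hof.update(population)
print(hof[0].fitness.values)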
@@ -826,132 +979,65 @@ Ensure a good mix of variations, all targeting the specified output style from t
 ).dict()
 self._add_to_history(initial_round_data)
 
-
-
-
-total=self.num_generations,
-desc=pbar_desc,
-unit="gen",
-disable=self.verbose < 1,
-postfix={pbar_postfix_key: f"{best_primary_score_overall:.4f}", "llm_calls": self.llm_call_counter}
-)
+with reporting.start_evolutionary_algo(verbose=self.verbose) as report_evolutionary_algo:
+for generation_idx in range(1, self.num_generations + 1):
+report_evolutionary_algo.start_gen(generation_idx, self.num_generations)
 
-
-for gen_idx in range(1, self.num_generations + 1):
-gen = gen_idx
-self._current_generation = gen
-pbar.set_postfix({pbar_postfix_key: f"{best_primary_score_overall:.4f}", "llm_calls": self.llm_call_counter})
-previous_best_primary_score_for_gen = best_primary_score_overall
-
-# Population restart logic
-current_pop_best_primary = 0.0
-if deap_population and deap_population[0].fitness.valid:
-current_pop_best_primary = max(ind.fitness.values[0] for ind in deap_population if ind.fitness.valid)
-
-if self._best_fitness_history and current_pop_best_primary < self._best_fitness_history[-1] * (1 + self.DEFAULT_RESTART_THRESHOLD):
-self._generations_without_improvement += 1
-else:
-self._generations_without_improvement = 0
-self._best_fitness_history.append(current_pop_best_primary)
+curr_best_score = self._population_best_score(deap_population)
 
-
-
-
-
-
-
-deap_population = [creator.Individual(p_str) for p_str in new_population_strings]
-self._generations_without_improvement = 0
-fitnesses_new = list(map(self.toolbox.evaluate, deap_population))
-for ind, fit in zip(deap_population, fitnesses_new):
-ind.fitness.values = fit
-# Offspring will be selected from this new population in the next step
-
-# Standard DEAP evolutionary algorithm steps
-if self.enable_moo:
-# NSGA-II is used for MOO
-offspring = self.toolbox.select(deap_population, self.population_size)
-else:
-# Single-objective: Elitism + Selection
-elites = tools.selBest(deap_population, self.elitism_size)
-selected_offspring = self.toolbox.select(deap_population, len(deap_population) - self.elitism_size)
-offspring = elites + selected_offspring
-
-# Set up the offspring for the next generation
-offspring = list(map(self.toolbox.clone, offspring))
-for child1, child2 in zip(offspring[::2], offspring[1::2]):
-if random.random() < self.crossover_rate:
-self.toolbox.mate(child1, child2)
-del child1.fitness.values
-del child2.fitness.values
-
-# Mutate the offspring
-current_mutation_rate = self._get_adaptive_mutation_rate()
-for mutant in offspring:
-if random.random() < current_mutation_rate:
-self.toolbox.mutate(mutant, task_config=task_config)
-del mutant.fitness.values
-
-# Evaluate the offspring
-invalid_ind = [ind for ind in offspring if not ind.fitness.valid]
-fitnesses_eval = map(self.toolbox.evaluate, invalid_ind)
-for ind, fit in zip(invalid_ind, fitnesses_eval):
-ind.fitness.values = fit
-
-# Update the hall of fame
-hof.update(offspring)
-deap_population[:] = offspring # Replace population
+# ---------- restart logic -------------------------------------
+if self._should_restart_population(curr_best_score):
+report_evolutionary_algo.restart_population(self.DEFAULT_RESTART_GENERATIONS)
+deap_population = self._restart_population(
+hof, deap_population, best_prompt_overall
+)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+# ---------- run one generation --------------------------------
+deap_population, invalid_count = self._run_generation(
+generation_idx, deap_population, prompt, hof, report_evolutionary_algo, best_primary_score_overall
+)
+
+# -------- update best-prompt bookkeeping -------------------------
+previous_best_primary_score_for_gen = best_primary_score_overall
+if hof:
+if self.enable_moo:
+current_best_ind = max(hof, key=lambda ind: ind.fitness.values[0])
+else:
+current_best_ind = hof[0]
+
+updated_best_primary_score = current_best_ind.fitness.values[0]
+if updated_best_primary_score > best_primary_score_overall:
+best_primary_score_overall = updated_best_primary_score
+self._generations_without_overall_improvement = 0
+elif updated_best_primary_score == previous_best_primary_score_for_gen:
+self._generations_without_overall_improvement += 1
+else:
+self._generations_without_overall_improvement += 1
 else:
-# Score might have decreased or HOF is empty (less likely for SO HOF with size > 0)
 self._generations_without_overall_improvement += 1
-else:
-# Score might have decreased or HOF is empty (less likely for SO HOF with size > 0)
-self._generations_without_overall_improvement += 1
-
-record = record_stats.compile(deap_population) if record_stats else {}
-logbook.record(gen=gen, evals=len(invalid_ind), **record)
-if self.verbose >= 1:
-print(logbook.stream)
-
-# History logging for this transition
-# FIXME: Use model.dump() instead of dict()
-gen_round_data = OptimizationRound(
-round_number=gen,
-current_prompt=best_prompt_overall, # Representative best
-current_score=best_primary_score_overall,
-generated_prompts=[{"prompt": str(ind), "score": ind.fitness.values[0]} for ind in deap_population if ind.fitness.valid],
-best_prompt=best_prompt_overall,
-best_score=best_primary_score_overall,
-improvement=(best_primary_score_overall - initial_primary_score) / abs(initial_primary_score) if initial_primary_score and initial_primary_score != 0 else (1.0 if best_primary_score_overall > 0 else 0.0)
-).dict()
-self._add_to_history(gen_round_data)
-pbar.update(1)
 
-
-
-
+# ---------- early-stopping check ------------------------------
+if self._generations_without_overall_improvement >= self.DEFAULT_EARLY_STOPPING_GENERATIONS:
+logger.info(
+"No overall improvement for %d generations – early stopping at gen %d.",
+self.DEFAULT_EARLY_STOPPING_GENERATIONS,
+generation_idx,
+)
+break
+
+# History logging for this transition
+# FIXME: Use model.dump() instead of dict()
+gen_round_data = OptimizationRound(
+round_number=generation_idx,
+current_prompt=best_prompt_overall, # Representative best
+current_score=best_primary_score_overall,
+generated_prompts=[{"prompt": str(ind), "score": ind.fitness.values[0]} for ind in deap_population if ind.fitness.valid],
+best_prompt=best_prompt_overall,
+best_score=best_primary_score_overall,
+improvement=(best_primary_score_overall - initial_primary_score) / abs(initial_primary_score) if initial_primary_score and initial_primary_score != 0 else (1.0 if best_primary_score_overall > 0 else 0.0)
+).dict()
+self._add_to_history(gen_round_data)
 
-pbar.close()
-logger.info(f"\n{log_prefix} Evolutionary Optimization finished after {gen} generations.")
 stopped_early_flag = self._generations_without_overall_improvement >= self.DEFAULT_EARLY_STOPPING_GENERATIONS
 final_details = {}
 initial_score_for_display = initial_primary_score
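The refactor above replaces the inline DEAP loop with private helpers (_should_restart_population, _restart_population, _run_generation) plus an early-stopping check. The sketch below reproduces only that control flow with toy stand-ins; nothing in it is the package's actual implementation.

import random

# Toy stand-ins; the real optimizer delegates these steps to its private methods.
def run_generation(scores):
    return [min(1.0, s + random.uniform(-0.02, 0.05)) for s in scores]

def should_restart(history, threshold=0.01, patience=3):
    # Restart when the best score has not improved by `threshold` for `patience` generations.
    if len(history) <= patience:
        return False
    return history[-1] < history[-1 - patience] * (1 + threshold)

def evolve(scores, num_generations=10, early_stop_after=5):
    best, stale, history = max(scores), 0, [max(scores)]
    for gen in range(1, num_generations + 1):
        if should_restart(history):
            scores = [random.random() for _ in scores]  # toy restart
        scores = run_generation(scores)
        gen_best = max(scores)
        history.append(gen_best)
        if gen_best > best:
            best, stale = gen_best, 0
        else:
            stale += 1
        if stale >= early_stop_after:  # mirrors the early-stopping check above
            break
    return best

print(evolve([random.random() for _ in range(8)]))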
@@ -963,12 +1049,12 @@ Ensure a good mix of variations, all targeting the specified output style from t
 for i, sol in enumerate(sorted_hof):
 final_results_log += f"  Solution {i+1}: Primary Score={sol.fitness.values[0]:.4f}, Length={sol.fitness.values[1]:.0f}, Prompt='{str(sol)[:100]}...'\n"
 best_overall_solution = sorted_hof[0]
-final_best_prompt =
+final_best_prompt = chat_prompt.ChatPrompt(messages=best_overall_solution)
 final_primary_score = best_overall_solution.fitness.values[0]
 final_length = best_overall_solution.fitness.values[1]
 logger.info(final_results_log)
 logger.info(f"Representative best prompt (highest primary score from Pareto front): '{final_best_prompt}'")
-logger.info(f"  Primary Score ({
+logger.info(f"  Primary Score ({metric.__name__}): {final_primary_score:.4f}")
 logger.info(f"  Length: {final_length:.0f}")
 final_details.update({
 "initial_primary_score": initial_primary_score,
@@ -986,7 +1072,7 @@ Ensure a good mix of variations, all targeting the specified output style from t
 logger.warning("MOO: ParetoFront is empty. Reporting last known best.")
 final_best_prompt = best_prompt_overall
 final_primary_score = best_primary_score_overall
-final_length = float(len(final_best_prompt))
+final_length = float(len(json.dumps(final_best_prompt.formatted_messages)))
 final_details.update({"initial_primary_score": initial_primary_score, "initial_length": initial_length,
 "final_prompt_representative": final_best_prompt, "final_primary_score_representative": final_primary_score,
 "final_length_representative": final_length, "pareto_front_solutions": []})
@@ -995,9 +1081,9 @@ Ensure a good mix of variations, all targeting the specified output style from t
 final_best_prompt = best_prompt_overall
 final_primary_score = best_primary_score_overall
 logger.info(f"Final best prompt from Hall of Fame: '{final_best_prompt}'")
-logger.info(f"Final best score ({
+logger.info(f"Final best score ({metric.__name__}): {final_primary_score:.4f}")
 final_details.update({
-"initial_prompt":
+"initial_prompt": prompt.formatted_messages,
 "initial_score": initial_primary_score,
 "initial_score_for_display": initial_primary_score,
 "final_prompt": final_best_prompt,
@@ -1014,15 +1100,13 @@ Ensure a good mix of variations, all targeting the specified output style from t
 
 # Add final details
 final_details.update({
-"total_generations_run":
+"total_generations_run": generation_idx + 1,
 "population_size": self.population_size,
 "mutation_probability": self.mutation_rate,
 "crossover_probability": self.crossover_rate,
 "elitism_size": self.elitism_size if not self.enable_moo else "N/A (MOO uses NSGA-II)",
 "adaptive_mutation": self.adaptive_mutation,
-"
-"task_config": task_config.dict(),
-"metric_config": metric_config.dict(),
+"metric_name": metric.__name__,
 "model": self.model,
 "moo_enabled": self.enable_moo,
 "llm_crossover_enabled": self.enable_llm_crossover,
@@ -1039,11 +1123,17 @@ Ensure a good mix of variations, all targeting the specified output style from t
 })
 
 # Return the OptimizationResult
+reporting.display_result(
+initial_score=initial_score_for_display,
+best_score=final_primary_score,
+best_prompt=final_best_prompt.formatted_messages,
+verbose=self.verbose
+)
 return OptimizationResult(
 optimizer=self.__class__.__name__,
-prompt=final_best_prompt,
+prompt=final_best_prompt.formatted_messages,
 score=final_primary_score,
-metric_name=
+metric_name=metric.__name__,
 details=final_details,
 history=self.get_history(),
 llm_calls=self.llm_call_counter
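For readers consuming the result: `prompt` is now a list of role/content dicts rather than a plain string. The stand-in dataclass below mirrors only the fields passed to OptimizationResult in this hunk; the real class lives in opik_optimizer/optimization_result.py and may expose more.

from dataclasses import dataclass, field
from typing import Any, Dict, List

# Hypothetical stand-in, not the package's OptimizationResult.
@dataclass
class ResultView:
    prompt: List[Dict[str, str]]
    score: float
    metric_name: str
    details: Dict[str, Any] = field(default_factory=dict)

def summarize(result: ResultView) -> str:
    lines = [f"{result.metric_name}: {result.score:.4f}"]
    for message in result.prompt:  # prompt is now a list of {"role", "content"} dicts
        lines.append(f"[{message['role']}] {message['content'][:60]}")
    return "\n".join(lines)

print(summarize(ResultView(
    prompt=[{"role": "system", "content": "Answer concisely."}],
    score=0.82,
    metric_name="levenshtein_ratio",
)))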
@@ -1052,8 +1142,7 @@ Ensure a good mix of variations, all targeting the specified output style from t
 @_throttle.rate_limited(_rate_limiter)
 def _call_model(
 self,
-
-system_prompt: Optional[str] = None,
+messages: List[Dict[str, str]],
 is_reasoning: bool = False,
 optimization_id: Optional[str] = None,
 ) -> str:
@@ -1084,11 +1173,6 @@ Ensure a good mix of variations, all targeting the specified output style from t
 if metadata_for_opik:
 llm_config_params["metadata"] = metadata_for_opik
 
-messages = []
-if system_prompt:
-messages.append({"role": "system", "content": system_prompt})
-messages.append({"role": "user", "content": prompt})
-
 # Pass llm_config_params to the Opik monitor
 final_call_params = opik_litellm_monitor.try_add_opik_monitoring_to_params(
 llm_config_params.copy()
@@ -1102,14 +1186,16 @@ Ensure a good mix of variations, all targeting the specified output style from t
 response = litellm.completion(
 model=self.model, messages=messages, **final_call_params
 )
+
+logger.debug(f"Response: {response}")
 return response.choices[0].message.content
-except
+except litellm_exceptions.RateLimitError as e:
 logger.error(f"LiteLLM Rate Limit Error: {e}")
 raise
-except
+except litellm_exceptions.APIConnectionError as e:
 logger.error(f"LiteLLM API Connection Error: {e}")
 raise
-except
+except litellm_exceptions.ContextWindowExceededError as e:
 logger.error(f"LiteLLM Context Window Exceeded Error: {e}")
 raise
 except Exception as e:
@@ -1118,10 +1204,9 @@ Ensure a good mix of variations, all targeting the specified output style from t
 
 def evaluate_prompt(
 self,
-
-
-
-prompt: str,
+prompt: chat_prompt.ChatPrompt,
+dataset: opik.Dataset,
+metric: Callable,
 n_samples: Optional[int] = None,
 dataset_item_ids: Optional[List[str]] = None,
 experiment_config: Optional[Dict] = None,
@@ -1130,57 +1215,32 @@ Ensure a good mix of variations, all targeting the specified output style from t
 ) -> float:
 """
 Evaluate a single prompt (individual) against the dataset.
-
+
+Args:
+prompt: The prompt to evaluate
+dataset: The dataset to use for evaluation
+metric: Metric function to evaluate on, should have the arguments `dataset_item` and `llm_output`
+n_samples: Optional number of samples to use
+dataset_item_ids: Optional list of dataset item IDs to use
+experiment_config: Optional experiment configuration
+optimization_id: Optional optimization ID
+verbose: Controls internal logging/progress bars (0=off, 1=on).
+
+Returns:
+float: The metric value
 """
-
-
-if isinstance(dataset, str):
-# This should ideally be done once in optimize_prompt if dataset is a string
-# but if called standalone, we need to handle it.
-# TODO Move to base class
-opik_eval_dataset = self._opik_client.get_dataset(dataset)
-else:
-opik_eval_dataset = dataset
-
-total_items = len(opik_eval_dataset.get_items())
+total_items = len(dataset.get_items())
 
-# Determine subset_size for this evaluation run
-# TODO Move to dataset utils
-if dataset_item_ids:
-subset_size = len(dataset_item_ids)
-logger.debug(f"Using provided {subset_size} dataset_item_ids for evaluation.")
-elif n_samples is not None:
-if n_samples > total_items:
-logger.warning(
-f"Requested n_samples ({n_samples}) for individual evaluation is larger than dataset size ({total_items}). Using full dataset."
-)
-subset_size = None
-elif n_samples <= 0:
-logger.warning(
-f"Requested n_samples ({n_samples}) is <=0. Using full dataset for this evaluation."
-)
-subset_size = None
-else:
-subset_size = n_samples
-logger.debug(f"Using specified n_samples: {subset_size} items for this evaluation run.")
-else:
-# Default behavior if no n_samples and no dataset_item_ids are given for this specific call
-# This case should be rare if n_samples is passed down from optimize_prompt
-subset_size = min(total_items, min(20, max(10, int(total_items * 0.2))))
-logger.debug(
-f"Using automatic subset size for this evaluation: {subset_size} items (20% of {total_items} total items)"
-)
-
 current_experiment_config = experiment_config or {}
 current_experiment_config = {
 **current_experiment_config,
 **{
 "optimizer": self.__class__.__name__,
-"metric":
-"dataset":
+"metric": metric.__name__,
+"dataset": dataset.name,
 "configuration": {
-"
-"n_samples_for_eval":
+"prompt": prompt.formatted_messages,
+"n_samples_for_eval": len(dataset_item_ids) if dataset_item_ids is not None else n_samples,
 "total_dataset_items": total_items,
 },
 },
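A hypothetical call against the new evaluate_prompt signature, showing the (dataset_item, llm_output) metric contract documented above. The metric body, dataset field names and the commented-out call are illustrative assumptions; only the argument shapes come from this diff, and the chat_prompt import path is taken from the package layout shown at the top of it.

from opik_optimizer.optimization_config import chat_prompt

def exact_match(dataset_item, llm_output) -> float:
    # Toy metric with the documented (dataset_item, llm_output) arguments.
    expected = str(dataset_item.get("answer", ""))  # "answer" is an assumed field name
    return 1.0 if expected.strip() == llm_output.strip() else 0.0

prompt = chat_prompt.ChatPrompt(messages=[
    {"role": "system", "content": "Answer the question concisely."},
    {"role": "user", "content": "{question}"},
])

# score = optimizer.evaluate_prompt(
#     prompt=prompt,
#     dataset=dataset,      # an opik.Dataset fetched elsewhere
#     metric=exact_match,
#     n_samples=50,
# )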
@@ -1189,81 +1249,35 @@ Ensure a good mix of variations, all targeting the specified output style from t
 def llm_task(
 dataset_item: Dict[str, Any]
 ) -> Dict[str, str]:
-
-
-
-
-
-
-
-
-
-prompt_for_llm: str
-field_mapping = {
-field: dataset_item[field]
-for field in task_config.input_dataset_fields
-if field in dataset_item
-}
-
-if getattr(task_config, "use_chat_prompt", False):
-candidate_template = Template(prompt)
-user_content_parts = []
-for field_name in task_config.input_dataset_fields:
-if field_name in dataset_item:
-user_content_parts.append(f"{field_name.capitalize()}: {dataset_item[field_name]}")
-user_content = "\n".join(user_content_parts)
-
-raw_model_output = self._call_model(
-prompt=user_content,
-system_prompt=prompt,
-is_reasoning=False
-)
-
-else:
-input_clauses = []
-for field_name in task_config.input_dataset_fields:
-if field_name in dataset_item:
-input_clauses.append(
-f"{field_name.capitalize()}: {dataset_item[field_name]}"
-)
-item_specific_inputs_str = "\n".join(input_clauses)
-prompt_for_llm = f"{prompt}\n\n{item_specific_inputs_str}"
-
-raw_model_output = self._call_model(
-prompt=prompt_for_llm,
-system_prompt=None,
-is_reasoning=False
-)
+try:
+messages = [{
+"role": item["role"],
+"content": item["content"].format(**dataset_item)
+} for item in prompt.formatted_messages]
+except Exception as e:
+logger.warning(f"Error in llm_task, this is usually a parsing error: {e}")
+return {mappers.EVALUATED_LLM_TASK_OUTPUT: ""}
 
-
-
-
-
-if cleaned_model_output.lower().startswith(prefix.lower()):
-cleaned_model_output = cleaned_model_output[len(prefix):].strip()
-break
+model_output = self._call_model(
+messages=messages,
+is_reasoning=False
+)
 
-return {mappers.EVALUATED_LLM_TASK_OUTPUT:
-
-logger.debug(
-f"Starting evaluation for a prompt with {subset_size if subset_size else 'all'} samples (or specific IDs) for metric: {metric_config.metric.name}"
-)
+return {mappers.EVALUATED_LLM_TASK_OUTPUT: model_output}
 
 # Evaluate the prompt
 score = task_evaluator.evaluate(
-dataset=
+dataset=dataset,
 dataset_item_ids=dataset_item_ids,
-
+metric=metric,
 evaluated_task=llm_task,
 num_threads=self.num_threads,
 project_name=self.project_name,
-n_samples=
+n_samples=n_samples if dataset_item_ids is None else None,
 experiment_config=current_experiment_config,
 optimization_id=optimization_id,
-
-#verbose=effective_verbose,
+verbose=verbose
 )
-logger.debug(f"Evaluation score for prompt: {score:.4f}")
 return score
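llm_task now renders each message's content with str.format(**dataset_item) instead of concatenating task_config input fields. A standalone sketch of that substitution, with invented messages and field names:

# Illustrative only: the placeholder substitution performed by the new llm_task.
prompt_messages = [
    {"role": "system", "content": "Answer in one short sentence."},
    {"role": "user", "content": "Question: {question}"},
]
dataset_item = {"id": "item-1", "question": "What is a Pareto front?"}

try:
    rendered = [
        {"role": m["role"], "content": m["content"].format(**dataset_item)}
        for m in prompt_messages
    ]
except KeyError as exc:  # a missing placeholder is the usual "parsing error" case
    rendered = []
    print(f"Missing dataset field for placeholder: {exc}")

print(rendered[1]["content"] if rendered else "")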
 
 def _llm_deap_crossover(
@@ -1272,42 +1286,48 @@ Ensure a good mix of variations, all targeting the specified output style from t
 ind2: "creator.Individual"
 ) -> Tuple["creator.Individual", "creator.Individual"]:
 """Perform crossover by asking an LLM to blend two parent prompts."""
-
-
+reporting.display_message("  Recombining prompts using an LLM.", verbose=self.verbose)
+
+parent1_messages: List[Dict[Literal["role", "content"], str]] = ind1
+parent2_messages: List[Dict[Literal["role", "content"], str]] = ind2
 current_output_style_guidance = self.output_style_guidance
 
 user_prompt_for_llm_crossover = f"""Parent Prompt 1:
-'''{
+'''{parent1_messages}'''
 
 Parent Prompt 2:
-'''{
+'''{parent2_messages}'''
 
 Desired output style from target LLM for children prompts: '{current_output_style_guidance}'
 
-Please generate
-Follow the instructions provided in the system prompt regarding the JSON output format
+Please generate TWO child prompts by intelligently blending the ideas, styles, or structures from these two parents, ensuring the children aim to elicit the desired output style.
+Follow the instructions provided in the system prompt regarding the JSON output format:
+[
+[{{"role": "<role>", "content": "<content>"}}, {{"role": "<role>", "content": "<content>"}}], #child_1
+[{{"role": "<role>", "content": "<content>"}}, {{"role": "<role>", "content": "<content>"}}], #child_2
+]
 """
 try:
-logger.debug(f"Attempting LLM-driven crossover between: '{
+logger.debug(f"Attempting LLM-driven crossover between: '{parent1_messages[:50]}...' and '{parent2_messages[:50]}...' aiming for style: '{current_output_style_guidance[:30]}...'")
 response_content = self._call_model(
-
-
+messages=[
+{"role": "system", "content": self.get_llm_crossover_system_prompt()},
+{"role": "user", "content": user_prompt_for_llm_crossover},
+],
 is_reasoning=True
 )
 logger.debug(f"Raw LLM response for crossover: {response_content}")
 
-json_response =
-
-
-if not children_strings or not isinstance(children_strings, list) or not all(isinstance(cs, str) for cs in children_strings):
+json_response = utils.json_to_dict(response_content)
+if not isinstance(json_response, list) or len(json_response) != 2 or not all(isinstance(cs, list) for cs in json_response):
 logger.warning("LLM Crossover: Malformed or empty children_prompts list. Falling back.")
 raise ValueError("Malformed LLM crossover response")
 
-
-
+child1: List[Dict[Literal["role", "content"], str]] = json_response[0]
+child2: List[Dict[Literal["role", "content"], str]] = json_response[1]
 
-logger.debug(f"LLM Crossover generated child1: {
-return creator.Individual(
+logger.debug(f"LLM Crossover generated child1: {json.dumps(child1)[:50]}... Child2: {json.dumps(child2)[:50]}...")
+return creator.Individual(child1), creator.Individual(child2)
 
 except Exception as e:
 logger.warning(f"LLM-driven crossover failed: {e}. Falling back to standard crossover.")
@@ -1315,17 +1335,15 @@ Follow the instructions provided in the system prompt regarding the JSON output
 
 def _get_task_description_for_llm(
 self,
-
+prompt: chat_prompt.ChatPrompt
 ) -> str:
 """Generates a concise task description for use in LLM prompts for fresh generation or radical innovation."""
-
-
-description = f"Task: Given input(s) from field(s) '{input_fields_str}', generate a response for the field '{output_field_str}'. "
-description += f"The original high-level instruction being optimized is: '{task_config.instruction_prompt}'. "
+description = "Task: Given a list of AI messages with placeholder values, generate an effective prompt. "
+description += f"The original high-level instruction being optimized is: '{prompt.formatted_messages}'. "
 description += "The goal is to create an effective prompt that guides a language model to perform this task well."
 return description
 
-def
+def _get_reasoning_system_prompt_for_variation(self) -> str:
 return f"""You are an expert prompt engineer specializing in creating diverse and effective prompts. Given an initial prompt, your task is to generate a diverse set of alternative prompts.
 
 For each prompt variation, consider:
@@ -1367,13 +1385,18 @@ Consider the following when generating children:
 - You can create a child that is a direct blend, or one that takes a primary structure from one parent and incorporates specific elements from the other, always optimizing for clear instruction towards the desired output style.
 - If generating two children, try to make them distinct from each other and from the parents, perhaps by emphasizing different aspects of the parental combination that could lead to the desired output style.
 
-
-
-
-
+All generated prompts must aim for eliciting answers in the style: '{self.output_style_guidance}'.
+
+Return a JSON object that is a list of both child prompts. Each child prompt is a list of LLM messages. Example:
+[
+[{{"role": "<role>", "content": "<content>"}},{{"role": "<role>", "content": "<content>"}}],
+[{{"role": "<role>", "content": "<content>"}},{{"role": "<role>", "content": "<content>"}}]
+]
+
+
 """
 
-def
+def _get_radical_innovation_system_prompt(self) -> str:
 return f"""You are an expert prompt engineer and a creative problem solver.
 Given a task description and an existing prompt for that task (which might be underperforming), your goal is to generate a new, significantly improved, and potentially very different prompt.
 Do not just make minor edits. Think about alternative approaches, structures, and phrasings that could lead to better performance.
@@ -1384,66 +1407,59 @@ Return only the new prompt string, with no preamble or explanation.
 def _infer_output_style_from_dataset(
 self,
 dataset: opik.Dataset,
-
+prompt: chat_prompt.ChatPrompt,
 n_examples: int = 5
 ) -> Optional[str]:
 """Analyzes dataset examples to infer the desired output style."""
-
-
-
-
-
-
-
-
-
-
-
-
-items_to_process = all_items[:n_examples]
-
-# Need at least a couple of examples for meaningful inference
-if len(items_to_process) < min(n_examples, 2):
-logger.warning(f"Not enough dataset items (found {len(items_to_process)}) to reliably infer output style. Need at least {min(n_examples,2)}.")
-return None
-
-examples_str = ""
-for i, item_obj in enumerate(items_to_process):
-item_content = item_obj.content if hasattr(item_obj, 'content') else item_obj
-if not isinstance(item_content, dict):
-logger.warning(f"Dataset item {i} does not have a .content dictionary or is not a dict itself. Skipping item: {item_obj}")
-continue
+with reporting.infer_output_style(verbose=self.verbose) as report_infer_output_style:
+report_infer_output_style.start_style_inference(n_examples)
+
+try:
+items_to_process = dataset.get_items(n_examples)
+except Exception as e:
+report_infer_output_style.error(f"Failed to get items from dataset '{dataset.name}': {e}")
+return None
+
+if not items_to_process:
+report_infer_output_style.error(f"Dataset '{dataset.name}' is empty. Cannot infer output style.")
+return None
 
-
-
-
-
-input_str = "\n".join(input_parts)
-output_str = item_content.get(task_config.output_dataset_field, "[NO OUTPUT FIELD FOUND]")
-examples_str += f"Example {i+1}:\nInput(s):\n{input_str}\nOutput: {output_str}\n---\n"
+# Need at least a couple of examples for meaningful inference
+if len(items_to_process) < min(n_examples, 2):
+report_infer_output_style.error(f"Not enough dataset items (found {len(items_to_process)}) to reliably infer output style. Need at least {min(n_examples,2)}.")
+return None
 
-
+examples_str = ""
+for i, item_content in enumerate(items_to_process):
+filtered_content = {x: y for x, y in item_content.items() if x != "id"}
+examples_str += f"Example {i+1}:\nDataset Item:\n{filtered_content}\n---\n"
 
-
+user_prompt_for_style_inference = f"""Please analyze the following examples from a dataset and provide a concise, actionable description of the REQUIRED output style for the target LLM. Before describing the output style, make sure to understand the dataset content and structure as it can include input, output and metadata fields. This description will be used to guide other LLMs in generating and refining prompts.
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+{examples_str}
+
+Based on these examples, what is the desired output style description?
+Remember to focus on aspects like length, tone, structure, content details, and any recurring keywords or phrasing patterns in the outputs.
+The description should be a single string that can be directly used as an instruction for another LLM.
+Return ONLY this descriptive string.
+"""
+#report_infer_output_style.display_style_inference_prompt(user_prompt_for_style_inference)
+
+try:
+inferred_style = self._call_model(
+messages=[
+{"role": "system", "content": self._INFER_STYLE_SYSTEM_PROMPT},
+{"role": "user", "content": user_prompt_for_style_inference}
+],
+is_reasoning=True
+)
+inferred_style = inferred_style.strip()
+if inferred_style:
+report_infer_output_style.success(inferred_style)
+return inferred_style
+else:
+report_infer_output_style.error("LLM returned empty string for inferred output style.")
+return None
+except Exception as e:
+report_infer_output_style.error(f"Error during output style inference: {e}")
 return None
-except Exception as e:
-logger.error(f"Error during output style inference: {e}")
-return None
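The new style-inference path builds its user prompt from raw dataset items (minus the id field) rather than from task_config input/output fields. A standalone sketch of that assembly, with invented items and a shortened instruction text:

# Illustrative only: how the style-inference prompt is assembled from dataset items.
items = [
    {"id": "1", "question": "Capital of France?", "answer": "Paris"},
    {"id": "2", "question": "Capital of Japan?", "answer": "Tokyo"},
]

examples_str = ""
for i, item in enumerate(items):
    filtered = {k: v for k, v in item.items() if k != "id"}  # drop the id field
    examples_str += f"Example {i+1}:\nDataset Item:\n{filtered}\n---\n"

user_prompt = (
    "Please analyze the following examples and describe the required output style.\n\n"
    f"{examples_str}\n"
    "Return ONLY this descriptive string."
)
print(user_prompt)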