opik-optimizer 0.8.0__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. opik_optimizer/__init__.py +15 -26
  2. opik_optimizer/base_optimizer.py +28 -44
  3. opik_optimizer/data/hotpot-500.json +501 -1001
  4. opik_optimizer/datasets/__init__.py +6 -7
  5. opik_optimizer/datasets/hotpot_qa.py +2 -1
  6. opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +742 -726
  7. opik_optimizer/evolutionary_optimizer/reporting.py +246 -0
  8. opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +297 -193
  9. opik_optimizer/few_shot_bayesian_optimizer/reporting.py +119 -0
  10. opik_optimizer/meta_prompt_optimizer/__init__.py +5 -0
  11. opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +816 -0
  12. opik_optimizer/meta_prompt_optimizer/reporting.py +140 -0
  13. opik_optimizer/mipro_optimizer/__init__.py +1 -1
  14. opik_optimizer/mipro_optimizer/_mipro_optimizer_v2.py +12 -20
  15. opik_optimizer/mipro_optimizer/mipro_optimizer.py +32 -52
  16. opik_optimizer/mipro_optimizer/utils.py +1 -23
  17. opik_optimizer/optimization_config/chat_prompt.py +106 -0
  18. opik_optimizer/optimization_config/configs.py +2 -21
  19. opik_optimizer/optimization_config/mappers.py +1 -1
  20. opik_optimizer/optimization_result.py +57 -85
  21. opik_optimizer/reporting_utils.py +180 -0
  22. opik_optimizer/task_evaluator.py +41 -26
  23. opik_optimizer/utils.py +187 -3
  24. {opik_optimizer-0.8.0.dist-info → opik_optimizer-0.9.0.dist-info}/METADATA +15 -31
  25. opik_optimizer-0.9.0.dist-info/RECORD +48 -0
  26. {opik_optimizer-0.8.0.dist-info → opik_optimizer-0.9.0.dist-info}/WHEEL +1 -1
  27. opik_optimizer/few_shot_bayesian_optimizer/prompt_parameter.py +0 -91
  28. opik_optimizer/few_shot_bayesian_optimizer/prompt_templates.py +0 -80
  29. opik_optimizer/integrations/__init__.py +0 -0
  30. opik_optimizer/meta_prompt_optimizer.py +0 -1151
  31. opik_optimizer-0.8.0.dist-info/RECORD +0 -45
  32. {opik_optimizer-0.8.0.dist-info → opik_optimizer-0.9.0.dist-info}/licenses/LICENSE +0 -0
  33. {opik_optimizer-0.8.0.dist-info → opik_optimizer-0.9.0.dist-info}/top_level.txt +0 -0
@@ -1,28 +1,31 @@
1
- from typing import Optional, Union, List, Dict, Any, Tuple
2
- import opik
3
- import logging
4
- import random
5
1
  import json
6
- from string import Template
2
+ import logging
7
3
  import os
8
- import time
4
+ import random
5
+ from typing import Any, Callable, Dict, List, Literal, Optional, Set, Tuple, cast
6
+
9
7
  import Levenshtein
8
+ import litellm
10
9
  import numpy as np
10
+ import opik
11
11
 
12
- from opik_optimizer.base_optimizer import BaseOptimizer, OptimizationRound
13
- from opik_optimizer.optimization_config.configs import TaskConfig, MetricConfig
14
- from opik_optimizer.optimization_result import OptimizationResult
15
- from opik_optimizer import task_evaluator
16
- from opik_optimizer.optimization_config import mappers
17
- from opik.api_objects import opik_client
18
- from opik.environment import get_tqdm_for_current_environment
19
- from opik_optimizer import _throttle
20
- import litellm
12
+ # DEAP imports
13
+ from deap import base, tools
14
+ from deap import creator as _creator
15
+ from litellm import exceptions as litellm_exceptions
21
16
  from litellm.caching import Cache
17
+ from litellm.types.caching import LiteLLMCacheType
18
+ from opik.api_objects import opik_client, optimization
19
+ from opik.environment import get_tqdm_for_current_environment
22
20
  from opik.evaluation.models.litellm import opik_monitor as opik_litellm_monitor
23
21
 
24
- # DEAP imports
25
- from deap import base, creator, tools, algorithms
22
+ from opik_optimizer import _throttle, task_evaluator
23
+ from opik_optimizer.base_optimizer import BaseOptimizer, OptimizationRound
24
+ from opik_optimizer.optimization_config import chat_prompt, mappers
25
+ from opik_optimizer.optimization_result import OptimizationResult
26
+
27
+ from .. import utils
28
+ from . import reporting
26
29
 
27
30
  logger = logging.getLogger(__name__)
28
31
  tqdm = get_tqdm_for_current_environment()
@@ -30,13 +33,25 @@ _rate_limiter = _throttle.get_rate_limiter_for_current_opik_installation()
30
33
 
31
34
  # Using disk cache for LLM calls
32
35
  disk_cache_dir = os.path.expanduser("~/.litellm_cache")
33
- litellm.cache = Cache(type="disk", disk_cache_dir=disk_cache_dir)
36
+ litellm.cache = Cache(type=LiteLLMCacheType.DISK, disk_cache_dir=disk_cache_dir)
37
+
38
+ creator = cast(Any, _creator) # type: ignore[assignment]
34
39
 
35
40
  class EvolutionaryOptimizer(BaseOptimizer):
36
41
  """
37
- Optimizes prompts using a genetic algorithm approach.
38
- Focuses on evolving the prompt text itself.
39
- Can operate in single-objective or multi-objective mode.
42
+ The Evolutionary Optimizer can be used to optimize prompts using a 4 stage genetic algorithm
43
+ approach:
44
+
45
+ 1. Generate a set of candidate prompts based on variations of the best prompts (exploitation) as
46
+ well as completely new prompts (exploration)
47
+ 2. Evaluate the candidate prompts
48
+ 3. Select the best prompts
49
+ 4. Repeat until convergence
50
+
51
+ This algorithm is best used if you have a first draft prompt and would like to find a better
52
+ prompt.
53
+
54
+ Note: This algorithm is time consuming and can be expensive to run.
40
55
  """
41
56
 
42
57
  DEFAULT_POPULATION_SIZE = 30
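The new docstring above describes the generate → evaluate → select → repeat loop at a high level. For orientation, a minimal end-to-end call consistent with the signatures shown later in this diff might look like the sketch below; the import path is taken from the file list, while the dataset name, the `{question}` placeholder, and the `answer` field are illustrative assumptions, not part of the package.

```python
# Hedged usage sketch based only on the signatures visible in this diff; names marked
# below are assumptions, not confirmed opik-optimizer API.
import Levenshtein
import opik
from opik_optimizer.evolutionary_optimizer.evolutionary_optimizer import EvolutionaryOptimizer
from opik_optimizer.optimization_config import chat_prompt

def levenshtein_metric(dataset_item, llm_output):
    # Metric callable per the optimize_prompt docstring in this diff; the "answer"
    # key is an assumption about the dataset schema.
    return Levenshtein.ratio(dataset_item["answer"], llm_output)

prompt = chat_prompt.ChatPrompt(
    messages=[
        {"role": "system", "content": "Answer the question as concisely as possible."},
        {"role": "user", "content": "{question}"},  # placeholder name is an assumption
    ]
)

optimizer = EvolutionaryOptimizer(model="openai/gpt-4o-mini", population_size=10, num_generations=3)
dataset = opik.Opik().get_dataset("hotpot-500")  # dataset name is an assumption
result = optimizer.optimize_prompt(prompt=prompt, dataset=dataset, metric=levenshtein_metric, n_samples=50)
print(result)
```

The class is probably also re-exported from the package root, but only the module path above is visible in this diff.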
@@ -78,7 +93,7 @@ Return ONLY this descriptive string, with no preamble or extra formatting.
78
93
  def __init__(
79
94
  self,
80
95
  model: str,
81
- project_name: Optional[str] = None,
96
+ project_name: str = "Optimization",
82
97
  population_size: int = DEFAULT_POPULATION_SIZE,
83
98
  num_generations: int = DEFAULT_NUM_GENERATIONS,
84
99
  mutation_rate: float = DEFAULT_MUTATION_RATE,
@@ -95,9 +110,26 @@ Return ONLY this descriptive string, with no preamble or extra formatting.
95
110
  verbose: int = 1,
96
111
  **model_kwargs,
97
112
  ):
98
- # FIXME: Hack for verbose till its merged
99
- self.verbose = 1
100
-
113
+ """
114
+ Args:
115
+ model: The model to use for evaluation
116
+ project_name: Optional project name for tracking
117
+ population_size: Number of prompts in the population
118
+ num_generations: Number of generations to run
119
+ mutation_rate: Mutation rate for genetic operations
120
+ crossover_rate: Crossover rate for genetic operations
121
+ tournament_size: Tournament size for selection
122
+ num_threads: Number of threads for parallel evaluation
123
+ elitism_size: Number of elitism prompts
124
+ adaptive_mutation: Whether to use adaptive mutation
125
+ enable_moo: Whether to enable multi-objective optimization - when enabled, optimizes for both the supplied metric and the length of the prompt
126
+ enable_llm_crossover: Whether to enable LLM crossover
127
+ seed: Random seed for reproducibility
128
+ output_style_guidance: Output style guidance for prompts
129
+ infer_output_style: Whether to infer output style
130
+ verbose: Controls internal logging/progress bars (0=off, 1=on).
131
+ **model_kwargs: Additional model parameters
132
+ """
101
133
  # Initialize base class first
102
134
  super().__init__(model=model, project_name=project_name, **model_kwargs)
103
135
  self.population_size = population_size
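The argument list documented above maps directly onto constructor keywords. A hedged example with illustrative values (the real defaults are the DEFAULT_* constants defined in this module and are not repeated here):

```python
# Illustrative values only; see the DEFAULT_* constants in this module for the actual defaults.
from opik_optimizer.evolutionary_optimizer.evolutionary_optimizer import EvolutionaryOptimizer

optimizer = EvolutionaryOptimizer(
    model="openai/gpt-4o-mini",   # evaluation model (example value)
    project_name="Optimization",  # new default shown in this diff
    population_size=30,
    num_generations=15,
    mutation_rate=0.2,
    crossover_rate=0.8,
    enable_moo=True,              # also optimizes prompt length alongside the metric
    infer_output_style=True,
    seed=42,                      # seeds random and numpy, per the next hunk
    verbose=1,
)
```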
@@ -122,12 +154,15 @@ Return ONLY this descriptive string, with no preamble or extra formatting.
122
154
  self._llm_cache = {}
123
155
  self._current_population = []
124
156
  self._generations_without_overall_improvement = 0
157
+ self._best_primary_score_history: list[float] = []
158
+ self._gens_since_pop_improvement: int = 0
159
+ self.verbose = verbose
125
160
 
126
161
  if self.seed is not None:
127
162
  random.seed(self.seed)
128
163
  np.random.seed(self.seed)
129
164
  logger.info(f"Global random seed set to: {self.seed}")
130
- # Note: DEAP tools generally respect random.seed().
165
+ # Note: DEAP tools generally respect random.seed().
131
166
  # TODO investigate if specific DEAP components require separate seeding
132
167
 
133
168
  if self.enable_moo:
@@ -142,7 +177,7 @@ Return ONLY this descriptive string, with no preamble or extra formatting.
142
177
  if not hasattr(creator, "Individual") or getattr(creator.Individual, "fitness") != fitness_attr:
143
178
  if hasattr(creator, "Individual"):
144
179
  del creator.Individual
145
- creator.create("Individual", str, fitness=fitness_attr)
180
+ creator.create("Individual", list, fitness=fitness_attr)
146
181
 
147
182
  self.toolbox = base.Toolbox()
148
183
  self.toolbox.register("default_individual", lambda: creator.Individual("placeholder"))
@@ -206,7 +241,7 @@ Return ONLY this descriptive string, with no preamble or extra formatting.
206
241
  return 0.0
207
242
 
208
243
  # Calculate average Levenshtein distance between all pairs
209
- total_distance = 0
244
+ total_distance = 0.0
210
245
  count = 0
211
246
  for i in range(len(self._current_population)):
212
247
  for j in range(i + 1, len(self._current_population)):
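The hunk above touches `_calculate_population_diversity`, which averages the Levenshtein distance over all prompt pairs in the current population. A standalone sketch of that average (whether individual distances are length-normalized is not visible in this hunk):

```python
# Hedged sketch of the pairwise-average diversity measure referenced above.
import Levenshtein

def average_pairwise_distance(prompts: list[str]) -> float:
    total_distance, count = 0.0, 0
    for i in range(len(prompts)):
        for j in range(i + 1, len(prompts)):
            total_distance += Levenshtein.distance(prompts[i], prompts[j])
            count += 1
    return total_distance / count if count > 0 else 0.0
```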
@@ -221,18 +256,10 @@ Return ONLY this descriptive string, with no preamble or extra formatting.
221
256
 
222
257
  return total_distance / count if count > 0 else 0.0
223
258
 
224
- def _deap_crossover(
225
- self,
226
- ind1: "creator.Individual",
227
- ind2: "creator.Individual"
228
- ) -> Tuple["creator.Individual", "creator.Individual"]:
229
- """Enhanced crossover operation that preserves semantic meaning.
230
- Attempts chunk-level crossover first, then falls back to word-level.
231
- """
232
- str1_orig, str2_orig = str(ind1), str(ind2)
233
259
 
234
- chunks1 = [chunk.strip() for chunk in str1_orig.split('.') if chunk.strip()]
235
- chunks2 = [chunk.strip() for chunk in str2_orig.split('.') if chunk.strip()]
260
+ def _deap_crossover_chunking_strategy(self, messages_1_str: str, messages_2_str: str) -> Tuple[str, str]:
261
+ chunks1 = [chunk.strip() for chunk in messages_1_str.split('.') if chunk.strip()]
262
+ chunks2 = [chunk.strip() for chunk in messages_2_str.split('.') if chunk.strip()]
236
263
 
237
264
  # Try chunk-level crossover if both parents have at least 2 chunks
238
265
  if len(chunks1) >= 2 and len(chunks2) >= 2:
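`_deap_crossover_chunking_strategy`, extracted in the hunk above, recombines two prompt strings at the sentence ("chunk") level and signals failure with a ValueError so the caller can fall back to word-level crossover. A hedged sketch of the idea; the exact crossover-point selection is truncated in this hunk and assumed here to mirror the word-level scheme:

```python
# Hedged sketch of sentence-chunk crossover; the crossover-point scheme is an assumption.
import random

def chunk_crossover(a: str, b: str) -> tuple[str, str]:
    chunks_a = [c.strip() for c in a.split('.') if c.strip()]
    chunks_b = [c.strip() for c in b.split('.') if c.strip()]
    if len(chunks_a) < 2 or len(chunks_b) < 2:
        raise ValueError("Not enough chunks for chunk-level crossover")
    point = random.randint(1, min(len(chunks_a), len(chunks_b)) - 1)
    child_a = '. '.join(chunks_a[:point] + chunks_b[point:]) + '.'
    child_b = '. '.join(chunks_b[:point] + chunks_a[point:]) + '.'
    return child_a, child_b
```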
@@ -247,35 +274,73 @@ Return ONLY this descriptive string, with no preamble or extra formatting.
247
274
  child1_str = '. '.join(child1_chunks) + ('.' if child1_chunks else '')
248
275
  child2_str = '. '.join(child2_chunks) + ('.' if child2_chunks else '')
249
276
 
250
- return creator.Individual(child1_str), creator.Individual(child2_str)
251
-
252
- # Fallback to word-level crossover if chunk-level is not suitable
253
- words1 = str1_orig.split()
254
- words2 = str2_orig.split()
277
+ return child1_str, child2_str
278
+ else:
279
+ raise ValueError("Not enough chunks in either prompt for chunk-level crossover")
280
+
281
+ def _deap_crossover_word_level(self, messages_1_str: str, messages_2_str: str) -> Tuple[str, str]:
282
+ words1 = messages_1_str.split()
283
+ words2 = messages_2_str.split()
255
284
 
256
285
  # If either prompt is empty (no words), return parents
257
286
  if not words1 or not words2:
258
- return ind1, ind2
287
+ return messages_1_str, messages_2_str
259
288
 
260
289
  min_word_len = min(len(words1), len(words2))
261
290
  # Need at least 2 words in the shorter prompt for a valid crossover point
262
291
  if min_word_len < 2:
263
- return ind1, ind2
292
+ return messages_1_str, messages_2_str
264
293
 
265
294
  # Crossover point for words: 1 to min_word_len - 1
266
295
  point = random.randint(1, min_word_len - 1)
267
296
  child1_words = words1[:point] + words2[point:]
268
297
  child2_words = words2[:point] + words1[point:]
269
298
 
270
- return creator.Individual(' '.join(child1_words)), creator.Individual(' '.join(child2_words))
299
+ return ' '.join(child1_words), ' '.join(child2_words)
300
+
301
+ def _deap_crossover(
302
+ self,
303
+ ind1: "creator.Individual",
304
+ ind2: "creator.Individual"
305
+ ) -> Tuple["creator.Individual", "creator.Individual"]:
306
+ """Enhanced crossover operation that preserves semantic meaning.
307
+ Attempts chunk-level crossover first, then falls back to word-level.
308
+ """
309
+ reporting.display_message(" Recombining prompts by mixing and matching words and sentences.", verbose=self.verbose)
310
+ messages_1_orig: List[Dict[Literal["role", "content"], str]] = ind1
311
+ messages_2_orig: List[Dict[Literal["role", "content"], str]] = ind2
312
+
313
+ for i, message_1 in enumerate(messages_1_orig):
314
+ role: str = message_1['role']
315
+ message_1_str: str = message_1['content']
316
+
317
+ # We check that the second prompt has a message at this index with a matching role
318
+ if (len(messages_2_orig) >= i + 1) and (messages_2_orig[i]['role'] == role):
319
+ message_2 = messages_2_orig[i]
320
+ message_2_str: str = message_2['content']
321
+
322
+ try:
323
+ child1_str, child2_str = self._deap_crossover_chunking_strategy(message_1_str, message_2_str)
324
+ except ValueError:
325
+ child1_str, child2_str = self._deap_crossover_word_level(message_1_str, message_2_str)
326
+
327
+ # Update the message content
328
+ messages_1_orig[i]['content'] = child1_str
329
+ messages_2_orig[i]['content'] = child2_str
330
+ else:
331
+ # We don't perform any crossover if there are not enough messages or the roles
332
+ # don't match
333
+ pass
334
+
335
+ return creator.Individual(messages_1_orig), creator.Individual(messages_2_orig)
271
336
 
272
337
  def _deap_mutation(
273
338
  self,
274
339
  individual: "creator.Individual",
275
- task_config: TaskConfig
276
- ) -> Tuple["creator.Individual",]:
277
- """Enhanced mutation operation with multiple strategies. Requires task_config for some mutations."""
278
- prompt = str(individual)
340
+ initial_prompt: chat_prompt.ChatPrompt
341
+ ) -> "creator.Individual":
342
+ """Enhanced mutation operation with multiple strategies."""
343
+ prompt = chat_prompt.ChatPrompt(messages=individual)
279
344
 
280
345
  # Choose mutation strategy based on current diversity
281
346
  diversity = self._calculate_population_diversity()
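The rewritten `_deap_crossover` above now operates on chat-message lists rather than plain strings: content is recombined only at positions where both parents have a message with the same role. A compact, self-contained sketch of that flow, using the word-level splice for brevity (the real method tries the chunk strategy first and falls back on ValueError):

```python
# Hedged sketch of role-aligned, per-message crossover as described in the hunk above.
import random

def word_crossover(a: str, b: str) -> tuple[str, str]:
    wa, wb = a.split(), b.split()
    if min(len(wa), len(wb)) < 2:
        return a, b  # not enough words for a crossover point
    p = random.randint(1, min(len(wa), len(wb)) - 1)
    return ' '.join(wa[:p] + wb[p:]), ' '.join(wb[:p] + wa[p:])

def crossover_messages(msgs_a: list[dict], msgs_b: list[dict]) -> tuple[list[dict], list[dict]]:
    for i, msg_a in enumerate(msgs_a):
        # Only cross content when the other prompt has a message at this index with the same role.
        if i < len(msgs_b) and msgs_b[i]["role"] == msg_a["role"]:
            msg_a["content"], msgs_b[i]["content"] = word_crossover(msg_a["content"], msgs_b[i]["content"])
    return msgs_a, msgs_b
```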
@@ -294,23 +359,29 @@ Return ONLY this descriptive string, with no preamble or extra formatting.
294
359
 
295
360
  if mutation_choice > structural_threshold:
296
361
  # This corresponds to the original 'else' (word_level_mutation)
297
- return self._word_level_mutation(prompt)
362
+ mutated_prompt = self._word_level_mutation_prompt(prompt)
363
+ reporting.display_success(" Mutation successful, prompt has been edited by randomizing words (word-level mutation).", verbose=self.verbose)
364
+ return creator.Individual(mutated_prompt.formatted_messages)
298
365
  elif mutation_choice > semantic_threshold:
299
366
  # This corresponds to the original 'elif' (structural_mutation)
300
- return self._structural_mutation(prompt)
367
+ mutated_prompt = self._structural_mutation(prompt)
368
+ reporting.display_success(" Mutation successful, prompt has been edited by reordering, combining, or splitting sentences (structural mutation).", verbose=self.verbose)
369
+ return creator.Individual(mutated_prompt.formatted_messages)
301
370
  else:
302
371
  # This corresponds to the original 'if' (semantic_mutation)
303
- return self._semantic_mutation(prompt, task_config)
372
+ mutated_prompt = self._semantic_mutation(prompt, initial_prompt)
373
+ reporting.display_success(" Mutation successful, prompt has been edited using an LLM (semantic mutation).", verbose=self.verbose)
374
+ return creator.Individual(mutated_prompt.formatted_messages)
304
375
 
305
376
  def _semantic_mutation(
306
377
  self,
307
- prompt: str,
308
- task_config: TaskConfig
309
- ) -> Tuple["creator.Individual",]:
378
+ prompt: chat_prompt.ChatPrompt,
379
+ initial_prompt: chat_prompt.ChatPrompt
380
+ ) -> chat_prompt.ChatPrompt:
310
381
  """Enhanced semantic mutation with multiple strategies."""
311
382
  current_output_style_guidance = self.output_style_guidance
312
383
  if random.random() < 0.1:
313
- return self._radical_innovation_mutation(prompt, task_config)
384
+ return self._radical_innovation_mutation(prompt, initial_prompt)
314
385
 
315
386
  try:
316
387
  strategy = random.choice([
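`_deap_mutation` splits a single random draw among the three operators, with thresholds adjusted by the current population diversity. Neither the threshold values nor the direction of the adjustment is visible in these hunks, so the numbers below are placeholders:

```python
# Hedged sketch; threshold values and the diversity adjustment are placeholders.
import random

def pick_mutation_strategy(diversity: float) -> str:
    # Placeholder adjustment: the optimizer derives these from its diversity measure.
    semantic_threshold = 0.3 if diversity < 0.5 else 0.5
    structural_threshold = 0.6 if diversity < 0.5 else 0.8
    choice = random.random()
    if choice > structural_threshold:
        return "word_level"   # _word_level_mutation_prompt
    elif choice > semantic_threshold:
        return "structural"   # _structural_mutation
    return "semantic"         # _semantic_mutation (LLM-based)
```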
@@ -327,58 +398,79 @@ Return ONLY this descriptive string, with no preamble or extra formatting.
327
398
  }
328
399
 
329
400
  user_prompt_for_semantic_mutation = f"""Given this prompt: '{prompt}'
330
- Task context: {self._get_task_description_for_llm(task_config)}
401
+ Task context: {self._get_task_description_for_llm(initial_prompt)}
331
402
  Desired output style from target LLM: '{current_output_style_guidance}'
332
403
  Instruction for this modification: {strategy_prompts[strategy]}.
333
- Return only the modified prompt string, nothing else.
404
+ Return only the modified prompt message list, nothing else. Make sure to return a valid JSON object.
334
405
  """
335
406
  response = self._call_model(
336
- prompt=user_prompt_for_semantic_mutation,
337
- system_prompt=f"You are a prompt engineering expert. Your goal is to modify prompts to improve their effectiveness in eliciting specific types of answers, particularly matching the style: '{current_output_style_guidance}'. Follow the specific modification instruction provided.",
407
+ messages=[
408
+ {"role": "system", "content": f"You are a prompt engineering expert. Your goal is to modify prompts to improve their effectiveness in eliciting specific types of answers, particularly matching the style: '{current_output_style_guidance}'. Follow the specific modification instruction provided."},
409
+ {"role": "user", "content": user_prompt_for_semantic_mutation}
410
+ ],
338
411
  is_reasoning=True
339
412
  )
340
- return creator.Individual(response.strip()),
413
+
414
+ return chat_prompt.ChatPrompt(messages=utils.json_to_dict(response.strip()))
341
415
  except Exception as e:
342
- logger.warning(f"Error in semantic mutation for prompt '{prompt[:50]}...': {e}")
343
- return creator.Individual(prompt),
416
+ reporting.display_error(f" Error in semantic mutation, this is usually a parsing error: {e}", verbose=self.verbose)
417
+ return prompt
344
418
 
345
419
  def _structural_mutation(
346
420
  self,
347
- prompt: str
348
- ) -> Tuple["creator.Individual",]:
421
+ prompt: chat_prompt.ChatPrompt
422
+ ) -> chat_prompt.ChatPrompt:
349
423
  """Perform structural mutation (reordering, combining, splitting)."""
350
- sentences = [s.strip() for s in prompt.split('.') if s.strip()]
351
- if len(sentences) <= 1:
352
- return self._word_level_mutation(prompt)
353
-
354
- mutation_type = random.random()
355
- if mutation_type < 0.3:
356
- # Reorder sentences
357
- random.shuffle(sentences)
358
- return creator.Individual('. '.join(sentences) + '.'),
359
- elif mutation_type < 0.6:
360
- # Combine adjacent sentences
361
- if len(sentences) >= 2:
362
- idx = random.randint(0, len(sentences) - 2)
363
- combined = sentences[idx] + ' and ' + sentences[idx + 1]
364
- sentences[idx:idx+2] = [combined]
365
- return creator.Individual('. '.join(sentences) + '.'),
366
- else:
367
- # Split a sentence
368
- idx = random.randint(0, len(sentences) - 1)
369
- words = sentences[idx].split()
370
- if len(words) > 3:
371
- split_point = random.randint(2, len(words) - 2)
372
- sentences[idx:idx+1] = [' '.join(words[:split_point]), ' '.join(words[split_point:])]
373
- return creator.Individual('. '.join(sentences) + '.'),
374
-
375
- return creator.Individual(prompt),
424
+ mutated_messages: List[Dict[Literal["role", "content"], str]] = []
376
425
 
377
- def _word_level_mutation(self, prompt: str) -> Tuple["creator.Individual",]:
426
+ for message in prompt.formatted_messages:
427
+ content = message["content"]
428
+ role = message["role"]
429
+
430
+ sentences = [s.strip() for s in content.split('.') if s.strip()]
431
+ if len(sentences) <= 1:
432
+ mutated_messages.append({"role": role, "content": self._word_level_mutation(content)})
433
+ continue
434
+
435
+ mutation_type = random.random()
436
+ if mutation_type < 0.3:
437
+ # Reorder sentences
438
+ random.shuffle(sentences)
439
+ mutated_messages.append({"role": role, "content": '. '.join(sentences) + '.'})
440
+ continue
441
+ elif mutation_type < 0.6:
442
+ # Combine adjacent sentences
443
+ if len(sentences) >= 2:
444
+ idx = random.randint(0, len(sentences) - 2)
445
+ combined = sentences[idx] + ' and ' + sentences[idx + 1]
446
+ sentences[idx:idx+2] = [combined]
447
+ mutated_messages.append({"role": role, "content": '. '.join(sentences) + '.'})
448
+ continue
449
+ else:
450
+ # Split a sentence
451
+ idx = random.randint(0, len(sentences) - 1)
452
+ words = sentences[idx].split()
453
+ if len(words) > 3:
454
+ split_point = random.randint(2, len(words) - 2)
455
+ sentences[idx:idx+1] = [' '.join(words[:split_point]), ' '.join(words[split_point:])]
456
+ mutated_messages.append({"role": role, "content": '. '.join(sentences) + '.'})
457
+ continue
458
+ else:
459
+ mutated_messages.append({"role": role, "content": content})
460
+
461
+ return chat_prompt.ChatPrompt(messages=mutated_messages)
462
+
463
+ def _word_level_mutation_prompt(self, prompt: chat_prompt.ChatPrompt) -> chat_prompt.ChatPrompt:
464
+ mutated_messages: List[Dict[Literal['role', 'content'], str]] = []
465
+ for message in prompt.formatted_messages:
466
+ mutated_messages.append({"role": message["role"], "content": self._word_level_mutation(message["content"])})
467
+ return chat_prompt.ChatPrompt(messages=mutated_messages)
468
+
469
+ def _word_level_mutation(self, msg_content: str) -> str:
378
470
  """Perform word-level mutation."""
379
- words = prompt.split()
471
+ words = msg_content.split()
380
472
  if len(words) <= 1:
381
- return creator.Individual(prompt),
473
+ return msg_content
382
474
 
383
475
  mutation_type = random.random()
384
476
  if mutation_type < 0.3:
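`_structural_mutation` above now mutates each message's content independently: it reorders, combines, or splits sentences, and falls back to word-level mutation for single-sentence content. The per-message transformation, extracted as a standalone sketch:

```python
# Sketch of the per-message structural mutation shown in the hunk above.
import random

def structural_mutation(content: str) -> str:
    sentences = [s.strip() for s in content.split('.') if s.strip()]
    if len(sentences) <= 1:
        return content  # the optimizer falls back to word-level mutation here
    r = random.random()
    if r < 0.3:
        random.shuffle(sentences)                    # reorder sentences
    elif r < 0.6 and len(sentences) >= 2:
        i = random.randint(0, len(sentences) - 2)    # combine two adjacent sentences
        sentences[i:i + 2] = [sentences[i] + ' and ' + sentences[i + 1]]
    else:
        i = random.randint(0, len(sentences) - 1)    # split one sentence in the middle
        words = sentences[i].split()
        if len(words) > 3:
            p = random.randint(2, len(words) - 2)
            sentences[i:i + 1] = [' '.join(words[:p]), ' '.join(words[p:])]
    return '. '.join(sentences) + '.'
```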
@@ -395,7 +487,7 @@ Return only the modified prompt string, nothing else.
395
487
  idx = random.randint(0, len(words) - 1)
396
488
  words[idx] = self._modify_phrase(words[idx])
397
489
 
398
- return creator.Individual(' '.join(words)),
490
+ return ' '.join(words)
399
491
 
400
492
  def _get_synonym(
401
493
  self,
@@ -404,8 +496,10 @@ Return only the modified prompt string, nothing else.
404
496
  """Get a synonym for a word using LLM."""
405
497
  try:
406
498
  response = self._call_model(
407
- prompt=f"Give me a single synonym for the word '{word}'. Return only the synonym, nothing else.",
408
- system_prompt="You are a helpful assistant that provides synonyms. Return only the synonym word, no explanation or additional text.",
499
+ messages=[
500
+ {"role": "system", "content": "You are a helpful assistant that provides synonyms. Return only the synonym word, no explanation or additional text."},
501
+ {"role": "user", "content": f"Give me a single synonym for the word '{word}'. Return only the synonym, nothing else."}
502
+ ],
409
503
  is_reasoning=True
410
504
  )
411
505
  return response.strip()
@@ -420,8 +514,10 @@ Return only the modified prompt string, nothing else.
420
514
  """Modify a phrase while preserving meaning using LLM."""
421
515
  try:
422
516
  response = self._call_model(
423
- prompt=f"Modify this phrase while keeping the same meaning: '{phrase}'. Return only the modified phrase, nothing else.",
424
- system_prompt="You are a helpful assistant that rephrases text. Return only the modified phrase, no explanation or additional text.",
517
+ messages=[
518
+ {"role": "system", "content": "You are a helpful assistant that rephrases text. Return only the modified phrase, no explanation or additional text."},
519
+ {"role": "user", "content": f"Modify this phrase while keeping the same meaning: '{phrase}'. Return only the modified phrase, nothing else."}
520
+ ],
425
521
  is_reasoning=True
426
522
  )
427
523
  return response.strip()
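The hunks above switch the helper calls from separate `prompt=` / `system_prompt=` arguments to a single OpenAI-style `messages` list. Outside the package, an equivalent messages-based call through litellm (which this module already imports) would look roughly like this; `_call_model` itself is not shown in this diff, so this is an assumption about the call shape, not its implementation:

```python
# Hedged sketch of a messages-style completion via litellm; NOT the package's _call_model.
import litellm

def call_model(messages, model="openai/gpt-4o-mini"):
    response = litellm.completion(model=model, messages=messages)
    return response.choices[0].message.content

synonym = call_model([
    {"role": "system", "content": "You are a helpful assistant that provides synonyms. "
                                  "Return only the synonym word, no explanation."},
    {"role": "user", "content": "Give me a single synonym for the word 'concise'. Return only the synonym."},
])
```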
@@ -431,12 +527,12 @@ Return only the modified prompt string, nothing else.
431
527
 
432
528
  def _radical_innovation_mutation(
433
529
  self,
434
- prompt_str: str,
435
- task_config: TaskConfig
436
- ) -> Tuple["creator.Individual",]:
530
+ prompt: chat_prompt.ChatPrompt,
531
+ initial_prompt: chat_prompt.ChatPrompt
532
+ ) -> chat_prompt.ChatPrompt:
437
533
  """Attempts to generate a significantly improved and potentially very different prompt using an LLM."""
438
- logger.debug(f"Attempting radical innovation for prompt: {prompt_str[:70]}...")
439
- task_desc_for_llm = self._get_task_description_for_llm(task_config)
534
+ logger.debug(f"Attempting radical innovation for prompt: {json.dumps(prompt.formatted_messages)[:70]}...")
535
+ task_desc_for_llm = self._get_task_description_for_llm(initial_prompt)
440
536
  current_output_style_guidance = self.output_style_guidance
441
537
 
442
538
  user_prompt_for_radical_innovation = f"""Task Context:
@@ -444,225 +540,307 @@ Return only the modified prompt string, nothing else.
444
540
  Desired output style from target LLM: '{current_output_style_guidance}'
445
541
 
446
542
  Existing Prompt (which may be underperforming):
447
- '''{prompt_str}'''
543
+ '''{prompt.formatted_messages}'''
448
544
 
449
545
  Please generate a new, significantly improved, and potentially very different prompt for this task.
450
546
  Focus on alternative approaches, better clarity, or more effective guidance for the language model, aiming for the desired output style.
451
- Return only the new prompt string.
547
+ Return only the new prompt list object.
452
548
  """
453
549
  try:
454
550
  new_prompt_str = self._call_model(
455
- prompt=user_prompt_for_radical_innovation,
456
- system_prompt=self.get_radical_innovation_system_prompt(),
551
+ messages=[
552
+ {"role": "system", "content": self._get_radical_innovation_system_prompt()},
553
+ {"role": "user", "content": user_prompt_for_radical_innovation}
554
+ ],
457
555
  is_reasoning=True
458
556
  )
459
- logger.info(f"Radical innovation generated: {new_prompt_str[:70]}... from: {prompt_str[:70]}...")
460
- return creator.Individual(new_prompt_str.strip()),
557
+ logger.info(f"Radical innovation generated: {new_prompt_str[:70]}... from: {json.dumps(prompt.formatted_messages)[:70]}...")
558
+ return chat_prompt.ChatPrompt(messages=json.loads(new_prompt_str))
461
559
  except Exception as e:
462
- logger.warning(f"Radical innovation mutation failed for prompt '{prompt_str[:50]}...': {e}. Returning original.")
463
- return creator.Individual(prompt_str),
560
+ logger.warning(f"Radical innovation mutation failed for prompt '{json.dumps(prompt.formatted_messages)[:50]}...': {e}. Returning original.")
561
+ return prompt
464
562
 
465
563
  def _initialize_population(
466
564
  self,
467
- initial_prompt: str,
468
- task_config: TaskConfig,
469
- ) -> List[str]:
565
+ prompt: chat_prompt.ChatPrompt
566
+ ) -> List[chat_prompt.ChatPrompt]:
470
567
  """Initialize the population with diverse variations of the initial prompt,
471
568
  including some 'fresh start' prompts based purely on task description.
472
569
  All generated prompts should aim to elicit answers matching self.output_style_guidance.
473
570
  """
474
- population = [initial_prompt]
475
- if self.population_size <= 1:
476
- return population
571
+ with reporting.initializing_population(verbose=self.verbose) as init_pop_report:
572
+ init_pop_report.start(self.population_size)
573
+
574
+ population = [prompt]
575
+ if self.population_size <= 1:
576
+ return population
577
+
578
+ num_to_generate_total = self.population_size - 1
579
+ num_fresh_starts = max(1, int(num_to_generate_total * 0.2))
580
+ num_variations_on_initial = num_to_generate_total - num_fresh_starts
581
+
582
+ task_desc_for_llm = self._get_task_description_for_llm(prompt)
583
+ current_output_style_guidance = self.output_style_guidance
584
+
585
+ # Generate "fresh start" prompts if the initial prompt is not performing well
586
+ # Cold start prompts are generated from the task description
587
+ if num_fresh_starts > 0:
588
+ init_pop_report.start_fresh_prompts(num_fresh_starts)
589
+ fresh_start_user_prompt = f"""Here is a description of a task:
590
+ {task_desc_for_llm}
591
+
592
+ The goal is to generate prompts that will make a target LLM produce responses in the following style: '{current_output_style_guidance}'.
593
+
594
+ Please generate {num_fresh_starts} diverse and effective prompt(s) for a language model to accomplish this task, ensuring they guide towards this specific output style.
595
+ Focus on clarity, completeness, and guiding the model effectively towards the desired style. Explore different structural approaches.
596
+
597
+ Example of valid response: [
598
+ ["role": "<role>", "content": "<Prompt targeting specified style.>"],
599
+ ["role": "<role>", "content": "<Another prompt designed for the output style.>"]
600
+ ]
477
601
 
478
- num_to_generate_total = self.population_size - 1
479
- num_fresh_starts = max(1, int(num_to_generate_total * 0.2))
480
- num_variations_on_initial = num_to_generate_total - num_fresh_starts
602
+ Your response MUST be a valid JSON list of AI messages. Do NOT include any other text, explanations, or Markdown formatting like ```json ... ``` around the list.
603
+
604
+ """
605
+ try:
606
+ response_content = self._call_model(
607
+ messages=[
608
+ {"role": "system", "content": f"You are an expert prompt engineer. Your task is to generate novel, effective prompts from scratch based on a task description, specifically aiming for prompts that elicit answers in the style: '{current_output_style_guidance}'. Output ONLY a raw JSON list of strings."},
609
+ {"role": "user", "content": fresh_start_user_prompt}
610
+ ],
611
+ is_reasoning=True
612
+ )
613
+
614
+ logger.debug(f"Raw LLM response for fresh start prompts: {response_content}")
615
+
616
+ fresh_prompts = utils.json_to_dict(response_content)
617
+ if isinstance(fresh_prompts, list):
618
+ if all(isinstance(p, dict) for p in fresh_prompts) and all(p.get("role") is not None for p in fresh_prompts):
619
+ population.append(chat_prompt.ChatPrompt(messages=fresh_prompts))
620
+ init_pop_report.success_fresh_prompts(1)
621
+ elif all(isinstance(p, list) for p in fresh_prompts):
622
+ population.extend([chat_prompt.ChatPrompt(messages=p) for p in fresh_prompts[:num_fresh_starts]])
623
+ init_pop_report.success_fresh_prompts(len(fresh_prompts[:num_fresh_starts]))
624
+ else:
625
+ init_pop_report.failed_fresh_prompts(
626
+ num_fresh_starts,
627
+ f"LLM response for fresh starts was not a valid list of strings or was empty: {response_content}. Skipping fresh start prompts."
628
+ )
629
+ except json.JSONDecodeError as e_json:
630
+ init_pop_report.failed_fresh_prompts(
631
+ num_fresh_starts,
632
+ f"JSONDecodeError generating fresh start prompts: {e_json}. LLM response: '{response_content}'. Skipping fresh start prompts."
633
+ )
634
+ except Exception as e:
635
+ init_pop_report.failed_fresh_prompts(
636
+ num_fresh_starts,
637
+ f"Error generating fresh start prompts: {e}. Skipping fresh start prompts."
638
+ )
639
+
640
+ # Generate variations on the initial prompt for the remaining slots
641
+ # TODO: Could add variations with hyper-parameters from the task config like temperature, etc.
642
+ if num_variations_on_initial > 0:
643
+ init_pop_report.start_variations(num_variations_on_initial)
644
+
645
+ # TODO: We need to split this into batches as the model will not return enough tokens
646
+ # to generate all the candidates
647
+ user_prompt_for_variation = f"""Initial prompt:
648
+ '''{prompt.formatted_messages}'''
649
+
650
+ Task context:
651
+ {task_desc_for_llm}
652
+ Desired output style from target LLM: '{current_output_style_guidance}'
653
+
654
+ Generate {num_variations_on_initial} diverse alternative prompts based on the initial prompt above, keeping the task context and desired output style in mind.
655
+ All generated prompt variations should strongly aim to elicit answers from the target LLM matching the style: '{current_output_style_guidance}'.
656
+ For each variation, consider how to best achieve this style, e.g., by adjusting specificity, structure, phrasing, constraints, or by explicitly requesting it.
657
+
658
+ Return a JSON array of prompts with the following structure:
659
+ {{
660
+ "prompts": [
661
+ {{
662
+ "prompt": [{{"role": "<role>", "content": "<content>"}}],
663
+ "strategy": "brief description of the variation strategy used, e.g., 'direct instruction for target style'"
664
+ }}
665
+ // ... more prompts if num_variations_on_initial > 1
666
+ ]
667
+ }}
668
+ Ensure a good mix of variations, all targeting the specified output style from the end LLM.
669
+
670
+ Return a valid JSON object that is correctly escaped. Return nothing else, do not include any additional text or Markdown formatting.
671
+ """
672
+ try:
673
+ response_content_variations = self._call_model(
674
+ messages=[
675
+ {"role": "system", "content": self._get_reasoning_system_prompt_for_variation()},
676
+ {"role": "user", "content": user_prompt_for_variation}
677
+ ],
678
+ is_reasoning=True
679
+ )
680
+ logger.debug(f"Raw response for population variations: {response_content_variations}")
681
+ json_response_variations = json.loads(response_content_variations)
682
+ generated_prompts_variations = [p["prompt"] for p in json_response_variations.get("prompts", []) if isinstance(p, dict) and "prompt" in p]
683
+
684
+ if generated_prompts_variations:
685
+ init_pop_report.success_variations(len(generated_prompts_variations[:num_variations_on_initial]))
686
+ population.extend([chat_prompt.ChatPrompt(messages=p) for p in generated_prompts_variations[:num_variations_on_initial]])
687
+ else:
688
+ init_pop_report.failed_variations(num_variations_on_initial, "Could not parse 'prompts' list for variations. Skipping variations.")
689
+ except Exception as e:
690
+ init_pop_report.failed_variations(num_variations_on_initial, f"Error calling LLM for initial population variations: {e}")
691
+
692
+ # Ensure population is of the required size using unique prompts
693
+ # TODO Test with levenshtein distance
694
+ final_population_set: Set[str] = set()
695
+ final_population_list: List[chat_prompt.ChatPrompt] = []
696
+ for p in population:
697
+ if json.dumps(p.formatted_messages) not in final_population_set:
698
+ final_population_set.add(json.dumps(p.formatted_messages))
699
+ final_population_list.append(p)
700
+
701
+ init_pop_report.end(final_population_list)
702
+ # Return exactly population_size prompts if possible, or fewer if generation failed badly.
703
+ return final_population_list[:self.population_size]
481
704
 
482
- task_desc_for_llm = self._get_task_description_for_llm(task_config)
483
- current_output_style_guidance = self.output_style_guidance
484
705
 
485
- # Generate "fresh start" prompts if the initial prompt is not performing well
486
- # Cold start prompts are generated from the task description
487
- if num_fresh_starts > 0:
488
- logger.info(f"Generating {num_fresh_starts} 'fresh start' prompts based on task description (aiming for style: '{current_output_style_guidance[:30]}...')...")
489
- fresh_start_user_prompt = f"""Here is a description of a task:
490
- {task_desc_for_llm}
706
+ def _should_restart_population(self, curr_best: float) -> bool:
707
+ """
708
+ Update internal counters and decide if we should trigger
709
+ a population restart based on lack of improvement.
710
+ """
711
+ if self._best_primary_score_history:
712
+ threshold = self._best_primary_score_history[-1] * (1 + self.DEFAULT_RESTART_THRESHOLD)
713
+ if curr_best < threshold:
714
+ self._gens_since_pop_improvement += 1
715
+ else:
716
+ self._gens_since_pop_improvement = 0
717
+ self._best_primary_score_history.append(curr_best)
718
+ return self._gens_since_pop_improvement >= self.DEFAULT_RESTART_GENERATIONS
491
719
 
492
- The goal is to generate prompts that will make a target LLM produce responses in the following style: '{current_output_style_guidance}'.
720
+ def _restart_population(
721
+ self,
722
+ hof: tools.HallOfFame,
723
+ population: list["creator.Individual"],
724
+ best_prompt_so_far: chat_prompt.ChatPrompt,
725
+ ) -> list["creator.Individual"]:
726
+ """Return a fresh, evaluated population seeded by elites."""
727
+ if self.enable_moo:
728
+ elites = list(hof)
729
+ else:
730
+ elites = tools.selBest(population, self.elitism_size)
493
731
 
494
- Please generate {num_fresh_starts} diverse and effective prompt(s) for a language model to accomplish this task, ensuring they guide towards this specific output style.
495
- Focus on clarity, completeness, and guiding the model effectively towards the desired style. Explore different structural approaches.
496
- Your response MUST be a valid JSON list of strings. Do NOT include any other text, explanations, or Markdown formatting like ```json ... ``` around the list.
497
- Example of valid response: ["Prompt targeting specified style.", "Another prompt designed for the output style."]
498
- """
499
- try:
500
- response_content = self._call_model(
501
- prompt=fresh_start_user_prompt,
502
- system_prompt=f"You are an expert prompt engineer. Your task is to generate novel, effective prompts from scratch based on a task description, specifically aiming for prompts that elicit answers in the style: '{current_output_style_guidance}'. Output ONLY a raw JSON list of strings.",
503
- is_reasoning=True
504
- )
505
- logger.debug(f"Raw LLM response for fresh start prompts: {response_content}")
506
-
507
- cleaned_response_content = response_content.strip()
508
- if cleaned_response_content.startswith("```json"):
509
- cleaned_response_content = cleaned_response_content[7:]
510
- if cleaned_response_content.endswith("```"):
511
- cleaned_response_content = cleaned_response_content[:-3]
512
- elif cleaned_response_content.startswith("```"):
513
- cleaned_response_content = cleaned_response_content[3:]
514
- if cleaned_response_content.endswith("```"):
515
- cleaned_response_content = cleaned_response_content[:-3]
516
- cleaned_response_content = cleaned_response_content.strip()
517
-
518
- fresh_prompts = json.loads(cleaned_response_content)
519
- if isinstance(fresh_prompts, list) and all(isinstance(p, str) for p in fresh_prompts) and fresh_prompts:
520
- population.extend(fresh_prompts[:num_fresh_starts])
521
- logger.info(f"Generated {len(fresh_prompts[:num_fresh_starts])} fresh prompts from LLM.")
522
- else:
523
- logger.warning(f"LLM response for fresh starts was not a valid list of strings or was empty: {cleaned_response_content}. Using fallbacks for fresh starts.")
524
- population.extend(self._generate_fallback_variations(f"Fresh start targeting style: {current_output_style_guidance[:20]}", num_fresh_starts))
525
- except json.JSONDecodeError as e_json:
526
- logger.warning(f"JSONDecodeError generating fresh start prompts: {e_json}. LLM response (after cleaning): '{cleaned_response_content if 'cleaned_response_content' in locals() else response_content}'. Using fallbacks for fresh starts.")
527
- population.extend(self._generate_fallback_variations(f"Fresh start targeting style: {current_output_style_guidance[:20]}", num_fresh_starts))
528
- except Exception as e:
529
- logger.warning(f"Error generating fresh start prompts: {e}. Using fallbacks for fresh starts.")
530
- population.extend(self._generate_fallback_variations(f"Fresh start targeting style: {current_output_style_guidance[:20]}", num_fresh_starts))
732
+ seed_prompt = (
733
+ chat_prompt.ChatPrompt(messages=max(elites, key=lambda x: x.fitness.values[0]))
734
+ if elites else best_prompt_so_far
735
+ )
531
736
 
532
- # Generate variations on the initial prompt for the remaining slots
533
- # TODO: Could add variations with hyper-parameters from the task config like temperature, etc.
534
- if num_variations_on_initial > 0:
535
- logger.info(f"Generating {num_variations_on_initial} variations of the initial prompt (aiming for style: '{current_output_style_guidance[:30]}...')...")
536
- user_prompt_for_variation = f"""Initial prompt:
537
- '''{initial_prompt}'''
737
+ prompt_variants = self._initialize_population(seed_prompt)
738
+ new_pop = [creator.Individual(p.formatted_messages) for p in prompt_variants]
538
739
 
539
- Task context:
540
- {task_desc_for_llm}
541
- Desired output style from target LLM: '{current_output_style_guidance}'
740
+ for ind, fit in zip(new_pop, map(self.toolbox.evaluate, new_pop)):
741
+ ind.fitness.values = fit
542
742
 
543
- Generate {num_variations_on_initial} diverse alternative prompts based on the initial prompt above, keeping the task context and desired output style in mind.
544
- All generated prompt variations should strongly aim to elicit answers from the target LLM matching the style: '{current_output_style_guidance}'.
545
- For each variation, consider how to best achieve this style, e.g., by adjusting specificity, structure, phrasing, constraints, or by explicitly requesting it.
743
+ self._gens_since_pop_improvement = 0
744
+ return new_pop
546
745
 
547
- Return a JSON array of prompts with the following structure:
548
- {{
549
- "prompts": [
550
- {{
551
- "prompt": "alternative prompt 1 designed for the specified output style",
552
- "strategy": "brief description of the variation strategy used, e.g., 'direct instruction for target style'"
553
- }}
554
- // ... more prompts if num_variations_on_initial > 1
555
- ]
556
- }}
557
- Ensure a good mix of variations, all targeting the specified output style from the end LLM.
558
- """
559
- try:
560
- response_content_variations = self._call_model(
561
- prompt=user_prompt_for_variation,
562
- system_prompt=self.get_reasoning_system_prompt_for_variation(),
563
- is_reasoning=True
564
- )
565
- logger.debug(f"Raw response for population variations: {response_content_variations}")
566
- json_response_variations = json.loads(response_content_variations)
567
- generated_prompts_variations = [p["prompt"] for p in json_response_variations.get("prompts", []) if isinstance(p, dict) and "prompt" in p]
568
- if generated_prompts_variations:
569
- population.extend(generated_prompts_variations[:num_variations_on_initial])
570
- logger.info(f"Successfully parsed {len(generated_prompts_variations[:num_variations_on_initial])} variations from LLM response.")
571
- else:
572
- logger.warning("Could not parse 'prompts' list for variations. Using fallback for remaining.")
573
- population.extend(self._generate_fallback_variations(initial_prompt, num_variations_on_initial))
574
- except Exception as e:
575
- logger.error(f"Error calling LLM for initial population variations: {e}. Using fallback for remaining.")
576
- population.extend(self._generate_fallback_variations(initial_prompt, num_variations_on_initial))
577
-
578
- # Ensure population is of the required size using unique prompts
579
- # TODO Test with levenshtein distance
580
- final_population_set = set()
581
- final_population_list = []
582
- for p in population:
583
- if p not in final_population_set:
584
- final_population_set.add(p)
585
- final_population_list.append(p)
586
-
587
- # If not enough unique prompts, fill with fallbacks (could be more sophisticated)
588
- while len(final_population_list) < self.population_size and len(final_population_list) < num_to_generate_total +1:
589
- fallback_prompt = initial_prompt + f" #fallback{len(final_population_list)}"
590
- if fallback_prompt not in final_population_set:
591
- final_population_list.append(fallback_prompt)
592
- final_population_set.add(fallback_prompt)
593
- else:
594
- # Safeguard if initial_prompt itself is causing issues with uniqueness
595
- fallback_prompt = f"Fallback prompt variation {random.randint(1000,9999)}"
596
- if fallback_prompt not in final_population_set:
597
- final_population_list.append(fallback_prompt)
598
- final_population_set.add(fallback_prompt)
599
- # Avoid infinite loop in extreme edge case
600
- else: break
601
-
602
- logger.info(f"Initialized population with {len(final_population_list)} prompts.")
603
- # Return exactly population_size prompts if possible, or fewer if generation failed badly.
604
- return final_population_list[:self.population_size]
605
-
606
- def _generate_diverse_variation(
607
- self,
608
- base_prompt: str,
609
- seen_prompts: set
610
- ) -> str:
611
- """Generate a new variation that's different from existing ones."""
612
- max_attempts = 5
613
- for _ in range(max_attempts):
614
- # Try different mutation strategies
615
- mutation_choice = random.random()
616
- if mutation_choice < 0.3:
617
- new_prompt = self._semantic_mutation(base_prompt)[0]
618
- elif mutation_choice < 0.6:
619
- new_prompt = self._structural_mutation(base_prompt)[0]
620
- else:
621
- new_prompt = self._word_level_mutation(base_prompt)[0]
622
-
623
- # Check if this variation is sufficiently different
624
- is_diverse = True
625
- for existing in seen_prompts:
626
- if Levenshtein.distance(str(new_prompt), existing) / max(len(str(new_prompt)), len(existing)) < 0.3:
627
- is_diverse = False
628
- break
629
- if is_diverse:
630
- return str(new_prompt)
746
+ def _run_generation(
747
+ self,
748
+ generation_idx: int,
749
+ population: list["creator.Individual"],
750
+ prompt: chat_prompt.ChatPrompt,
751
+ hof: tools.HallOfFame,
752
+ report: Any,
753
+ best_primary_score_overall: float,
754
+ ) -> tuple[list["creator.Individual"], int]:
755
+ """Execute mating, mutation, evaluation and HoF update."""
756
+ best_gen_score = 0.0
757
+
758
+ # --- selection -------------------------------------------------
759
+ if self.enable_moo:
760
+ offspring = self.toolbox.select(population, self.population_size)
761
+ else:
762
+ elites = tools.selBest(population, self.elitism_size)
763
+ rest = self.toolbox.select(population, len(population) - self.elitism_size)
764
+ offspring = elites + rest
765
+
766
+ # --- crossover -------------------------------------------------
767
+ report.performing_crossover()
768
+ offspring = list(map(self.toolbox.clone, offspring))
769
+ for i in range(0, len(offspring), 2):
770
+ if i+1 < len(offspring):
771
+ c1, c2 = offspring[i], offspring[i+1]
772
+ if random.random() < self.crossover_rate:
773
+ c1_new, c2_new = self.toolbox.mate(c1, c2)
774
+ offspring[i], offspring[i+1] = c1_new, c2_new
775
+ del offspring[i].fitness.values, offspring[i+1].fitness.values
776
+ reporting.display_success(" Crossover successful, prompts have been combined and edited.\n│", verbose=self.verbose)
777
+
778
+ # --- mutation --------------------------------------------------
779
+ report.performing_mutation()
780
+ mut_rate = self._get_adaptive_mutation_rate()
781
+ n_mutations = 0
782
+ for i, ind in enumerate(offspring):
783
+ if random.random() < mut_rate:
784
+ new_ind = self.toolbox.mutate(ind, initial_prompt=prompt)
785
+ offspring[i] = new_ind
786
+ del offspring[i].fitness.values
787
+ n_mutations += 1
788
+ reporting.display_success(f" Mutation successful, {n_mutations} prompts have been edited.\n│", verbose=self.verbose)
631
789
 
632
- # If we couldn't generate a diverse variation, create a simple one
633
- return base_prompt + f" #v{len(seen_prompts)}"
790
+ # --- evaluation ------------------------------------------------
791
+ invalid = [ind for ind in offspring if not ind.fitness.valid]
792
+ report.performing_evaluation(len(invalid))
793
+ for ind_idx, ind in enumerate(invalid):
794
+ fit = self.toolbox.evaluate(ind)
795
+ ind.fitness.values = fit
796
+ best_gen_score = max(best_gen_score, fit[0])
634
797
 
635
- def _generate_fallback_variations(
636
- self,
637
- initial_prompt: str,
638
- num_variations: int
639
- ) -> List[str]:
640
- """Generate fallback variations when LLM generation fails."""
641
- variations = []
642
- words = initial_prompt.split()
798
+ report.performed_evaluation(ind_idx, ind.fitness.values[0])
643
799
 
644
- for i in range(num_variations):
645
- if len(words) > 3:
646
- # Shuffle words
647
- shuffled = words.copy()
648
- random.shuffle(shuffled)
649
- variations.append(' '.join(shuffled))
650
- else:
651
- # Add simple variations
652
- variations.append(initial_prompt + f" #v{i}")
800
+ # --- update HoF & reporter ------------------------------------
801
+ hof.update(offspring)
802
+ reporting.end_gen(generation_idx, best_gen_score, best_primary_score_overall, verbose=self.verbose)
653
803
 
654
- return variations
804
+ return offspring, len(invalid)
805
+
806
+ def _population_best_score(self, population: List["creator.Individual"]) -> float:
807
+ """Return highest primary-objective score among *valid* individuals."""
808
+ valid_scores = [ind.fitness.values[0] for ind in population if ind.fitness.valid]
809
+ return max(valid_scores, default=0.0)
655
810
 
656
811
  def optimize_prompt(
657
812
  self,
658
- dataset: Union[str, opik.Dataset],
659
- metric_config: MetricConfig,
660
- task_config: TaskConfig,
813
+ prompt: chat_prompt.ChatPrompt,
814
+ dataset: opik.Dataset,
815
+ metric: Callable,
661
816
  experiment_config: Optional[Dict] = None,
662
817
  n_samples: Optional[int] = None,
663
818
  auto_continue: bool = False,
664
819
  **kwargs,
665
820
  ) -> OptimizationResult:
821
+ """
822
+ Args:
823
+ prompt: The prompt to optimize
824
+ dataset: The dataset to use for evaluation
825
+ metric: Metric function to optimize with, should have the arguments `dataset_item` and `llm_output`
826
+ experiment_config: Optional experiment configuration
827
+ n_samples: Optional number of samples to use
828
+ auto_continue: Whether to automatically continue optimization
829
+ **kwargs: Additional keyword arguments
830
+ """
831
+ reporting.display_header(self.__class__.__name__, verbose=self.verbose)
832
+ reporting.display_configuration(
833
+ prompt.formatted_messages,
834
+ {
835
+ "optimizer": f"{ 'DEAP MOO' if self.enable_moo else 'DEAP SO' } Evolutionary Optimization",
836
+ "population_size": self.population_size,
837
+ "generations": self.num_generations,
838
+ "mutation_rate": self.mutation_rate,
839
+ "crossover_rate": self.crossover_rate,
840
+ },
841
+ verbose=self.verbose
842
+ )
843
+
666
844
  self.llm_call_counter = 0
667
845
  self._history = []
668
846
  self._current_optimization_id = None
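The hunk above carries the central API change of 0.9.0: `optimize_prompt` now takes a `ChatPrompt` plus a plain metric callable (with `dataset_item` and `llm_output` arguments) in place of the 0.8.0 `MetricConfig`/`TaskConfig` objects. A hedged before/after sketch, reusing the `optimizer`, `dataset`, and `chat_prompt` names from the example near the top of this diff; the `answer` key and `{question}` placeholder are assumptions:

```python
# 0.8.0-style call (dataset could also be a dataset name string):
# result = optimizer.optimize_prompt(dataset=dataset, metric_config=metric_config, task_config=task_config)

# 0.9.0-style call shown in this diff:
def exact_match(dataset_item, llm_output):
    return float(dataset_item["answer"].strip() == llm_output.strip())  # "answer" key is an assumption

result = optimizer.optimize_prompt(
    prompt=chat_prompt.ChatPrompt(messages=[{"role": "user", "content": "{question}"}]),
    dataset=dataset,      # an opik.Dataset object; plain dataset names are no longer accepted
    metric=exact_match,
    n_samples=100,
)
```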
@@ -672,145 +850,120 @@ Ensure a good mix of variations, all targeting the specified output style from t
672
850
  self._llm_cache.clear()
673
851
  self._current_population = []
674
852
  self._generations_without_overall_improvement = 0
675
-
676
- # Determine final output_style_guidance
677
- effective_output_style_guidance = self.output_style_guidance
678
- if self.infer_output_style and \
679
- (self.output_style_guidance is None or self.output_style_guidance == self.DEFAULT_OUTPUT_STYLE_GUIDANCE):
680
- # If user wants inference AND hasn't provided a specific custom guidance
681
- inferred_style = self._infer_output_style_from_dataset(dataset, task_config)
682
- if inferred_style:
683
- effective_output_style_guidance = inferred_style
684
- # Update self.output_style_guidance for this run so dynamic prompt methods use it
685
- self.output_style_guidance = inferred_style
686
- else:
687
- logger.warning("Failed to infer output style, using default or user-provided guidance.")
688
853
 
689
- # Ensure self.output_style_guidance is set to the effective one for the rest of the methods for this run
690
- # (It might have been None if user passed None and infer_output_style was False)
691
- if self.output_style_guidance is None:
692
- # Fallback if still None
693
- self.output_style_guidance = self.DEFAULT_OUTPUT_STYLE_GUIDANCE
694
-
695
- # The methods like get_reasoning_system_prompt_for_variation will now use the potentially updated self.output_style_guidance
696
- log_prefix = "DEAP MOO" if self.enable_moo else "DEAP SO"
697
- logger.info(f"Starting {log_prefix} Evolutionary Optimization for prompt: {task_config.instruction_prompt[:100]}...")
698
- logger.info(f"Population: {self.population_size}, Generations: {self.num_generations}, Mutation: {self.mutation_rate}, Crossover: {self.crossover_rate}")
699
-
700
- opik_dataset_obj: opik.Dataset
701
- if isinstance(dataset, str):
702
- opik_dataset_obj = self._opik_client.get_dataset(dataset)
703
- else:
704
- opik_dataset_obj = dataset
705
-
706
- opik_optimization_run = None
707
- try:
708
- opik_optimization_run = self._opik_client.create_optimization(
709
- dataset_name=opik_dataset_obj.name,
710
- objective_name=metric_config.metric.name,
711
- metadata={"optimizer": self.__class__.__name__},
712
- )
713
- self._current_optimization_id = opik_optimization_run.id
714
- logger.info(f"Created Opik Optimization run with ID: {self._current_optimization_id}")
715
- except Exception as e:
716
- logger.warning(f"Opik server error: {e}. Continuing without Opik tracking.")
717
-
718
- # Use of multi-objective fitness function or single-objective fitness function
854
+ # Step 0. Define fitness function
719
855
  if self.enable_moo:
720
856
  def _deap_evaluate_individual_fitness(
721
- individual_prompt_str: str
857
+ messages: List[Dict[str, str]]
722
858
  ) -> Tuple[float, float]:
723
- primary_fitness_score = self.evaluate_prompt(
724
- dataset=opik_dataset_obj, metric_config=metric_config, task_config=task_config,
725
- prompt=str(individual_prompt_str), n_samples=n_samples,
859
+ primary_fitness_score: float = self.evaluate_prompt(
860
+ prompt=chat_prompt.ChatPrompt(messages=messages),
861
+ dataset=dataset,
862
+ metric=metric,
863
+ n_samples=n_samples,
726
864
  experiment_config=(experiment_config or {}).copy(),
727
- optimization_id=self._current_optimization_id, verbose=0
865
+ optimization_id=self._current_optimization_id,
866
+ verbose=0
728
867
  )
729
- prompt_length = float(len(str(individual_prompt_str)))
730
- logger.debug(f"Evaluated MOO individual '{str(individual_prompt_str)[:50]}...' -> Primary Score: {primary_fitness_score:.4f}, Length: {prompt_length}")
868
+ prompt_length = float(len(str(json.dumps(messages))))
731
869
  return (primary_fitness_score, prompt_length)
732
870
  else:
733
871
  # Single-objective
734
872
  def _deap_evaluate_individual_fitness(
735
- individual_prompt_str: str
873
+ messages: List[Dict[str, str]]
736
874
  ) -> Tuple[float,]:
737
- fitness_score = self.evaluate_prompt(
738
- dataset=opik_dataset_obj, metric_config=metric_config, task_config=task_config,
739
- prompt=str(individual_prompt_str), n_samples=n_samples,
875
+ fitness_score: float = self.evaluate_prompt(
876
+ prompt=chat_prompt.ChatPrompt(messages=messages),
877
+ dataset=dataset,
878
+ metric=metric,
879
+ n_samples=n_samples,
740
880
  experiment_config=(experiment_config or {}).copy(),
741
- optimization_id=self._current_optimization_id, verbose=0
881
+ optimization_id=self._current_optimization_id,
882
+ verbose=0
742
883
  )
743
- logger.debug(f"Evaluated SO individual '{str(individual_prompt_str)[:50]}...' -> Score: {fitness_score:.4f}")
744
884
  return (fitness_score,)
745
-
746
- # Register the fitness function with DEAP
747
885
  self.toolbox.register("evaluate", _deap_evaluate_individual_fitness)
748
886
 
749
- initial_prompt_strings = self._initialize_population(
750
- initial_prompt=task_config.instruction_prompt, task_config=task_config
751
- )
752
- deap_population = [creator.Individual(p_str) for p_str in initial_prompt_strings]
753
- deap_population = deap_population[:self.population_size]
887
+ # Step 1. Start Opik optimization run
888
+ opik_optimization_run: Optional[optimization.Optimization] = None
889
+ try:
890
+ opik_optimization_run: optimization.Optimization = self._opik_client.create_optimization(
891
+ dataset_name=dataset.name,
892
+ objective_name=metric.__name__,
893
+ metadata={"optimizer": self.__class__.__name__},
894
+ )
895
+ self._current_optimization_id = opik_optimization_run.id
896
+ logger.info(f"Created Opik Optimization run with ID: {self._current_optimization_id}")
897
+ except Exception as e:
898
+ logger.warning(f"Opik server error: {e}. Continuing without Opik tracking.")
754
899
 
755
- initial_eval_result = _deap_evaluate_individual_fitness(task_config.instruction_prompt)
756
- initial_primary_score = initial_eval_result[0]
757
- initial_length = initial_eval_result[1] if self.enable_moo else float(len(task_config.instruction_prompt))
900
+ # Step 2. Compute the initial performance of the prompt
901
+ with reporting.baseline_performance(verbose=self.verbose) as report_baseline_performance:
902
+ initial_eval_result: Tuple[float, float] | Tuple[float, ] = _deap_evaluate_individual_fitness(prompt.formatted_messages)
903
+ initial_primary_score: float = initial_eval_result[0]
904
+ initial_length: float = initial_eval_result[1] if self.enable_moo else float(len(json.dumps(prompt.formatted_messages)))
905
+
906
+ best_primary_score_overall: float = initial_primary_score
907
+ best_prompt_overall = prompt
908
+ report_baseline_performance.set_score(initial_primary_score)
758
909
 
759
- best_primary_score_overall = initial_primary_score
760
- best_prompt_overall = task_config.instruction_prompt
761
- if self.enable_moo:
762
- logger.info(f"Initial prompt '{task_config.instruction_prompt[:100]}...' -> Primary Score: {initial_primary_score:.4f}, Length: {initial_length}")
763
- else:
764
- logger.info(f"Initial prompt '{task_config.instruction_prompt[:100]}...' score: {initial_primary_score:.4f}")
910
+ # Step 3. Define the output style guide
911
+ effective_output_style_guidance = self.output_style_guidance
912
+ if self.infer_output_style and \
913
+ (self.output_style_guidance is None or self.output_style_guidance == self.DEFAULT_OUTPUT_STYLE_GUIDANCE):
914
+ # If user wants inference AND hasn't provided a specific custom guidance
915
+ inferred_style = self._infer_output_style_from_dataset(dataset, prompt)
916
+ if inferred_style:
917
+ effective_output_style_guidance = inferred_style
918
+ # Update self.output_style_guidance for this run so dynamic prompt methods use it
919
+ self.output_style_guidance = inferred_style
920
+ else:
921
+ logger.warning("Failed to infer output style, using default or user-provided guidance.")
765
922
 
766
- # Initialize the hall of fame (Pareto front for MOO) and stats for MOO or SO
923
+ # Ensure self.output_style_guidance is set to the effective one for the rest of the methods for this run
924
+ # (It might have been None if user passed None and infer_output_style was False)
925
+ if self.output_style_guidance is None:
926
+ # Fallback if still None
927
+ self.output_style_guidance = self.DEFAULT_OUTPUT_STYLE_GUIDANCE
928
+
929
+ # Step 4. Initialize population
930
+ initial_prompts: List[chat_prompt.ChatPrompt] = self._initialize_population(
931
+ prompt=prompt
932
+ )
933
+
934
+ deap_population = [creator.Individual(p.formatted_messages) for p in initial_prompts]
935
+ deap_population = deap_population[:self.population_size]
936
+
937
+ # Step 5. Initialize the hall of fame (Pareto front for MOO) and stats for MOO or SO
767
938
  if self.enable_moo:
768
939
  hof = tools.ParetoFront()
769
- stats_primary = tools.Statistics(lambda ind: ind.fitness.values[0])
770
- stats_length = tools.Statistics(lambda ind: ind.fitness.values[1])
771
- stats_primary.register("avg_score", lambda x: sum(x) / len(x) if len(x) > 0 else 0)
772
- stats_primary.register("max_score", max)
773
- stats_length.register("avg_len", lambda x: sum(x) / len(x) if len(x) > 0 else 0)
774
- stats_length.register("min_len", min)
775
- mstats = tools.MultiStatistics(score=stats_primary, length=stats_length)
776
- logbook_header_stats = mstats.fields
777
940
  else:
778
941
  # Single-objective
779
942
  hof = tools.HallOfFame(self.DEFAULT_HALL_OF_FAME_SIZE)
780
- stats = tools.Statistics(lambda ind: ind.fitness.values[0])
781
- stats.register("avg", lambda x: sum(x) / len(x) if len(x) > 0 else 0)
782
- stats.register("std", lambda x: (sum((xi - (sum(x) / len(x) if len(x) > 0 else 0))**2 for xi in x) / len(x))**0.5 if len(x) > 1 else 0)
783
- stats.register("min", min)
784
- stats.register("max", max)
785
- logbook_header_stats = stats.fields
786
-
787
- logbook = tools.Logbook()
788
- logbook.header = ["gen", "evals"] + logbook_header_stats
789
943
 
790
- # Evaluate the initial population
791
- fitnesses = list(map(self.toolbox.evaluate, deap_population))
792
- for ind, fit in zip(deap_population, fitnesses):
793
- ind.fitness.values = fit
944
+ # Step 6. Evaluate the initial population
945
+ with reporting.evaluate_initial_population(verbose=self.verbose) as report_initial_population:
946
+ fitnesses: List[float] = list(map(self.toolbox.evaluate, deap_population))
947
+ _best_score = max(best_primary_score_overall, max([x[0] for x in fitnesses]))
948
+
949
+ for i, ind, fit in zip(range(len(deap_population)), deap_population, fitnesses):
950
+ ind.fitness.values = fit
951
+ report_initial_population.set_score(i, fit[0], _best_score)
794
952
 
795
953
  hof.update(deap_population)
796
- record_stats = mstats if self.enable_moo else stats
797
- record = record_stats.compile(deap_population) if record_stats else {}
798
- logbook.record(gen=0, evals=len(deap_population), **record)
799
- if self.verbose >= 1:
800
- print(logbook.stream)
801
-
954
+
802
955
  if hof and len(hof) > 0:
803
956
  if self.enable_moo:
804
- current_best_for_primary = max(hof, key=lambda ind: ind.fitness.values[0])
805
- best_primary_score_overall = current_best_for_primary.fitness.values[0]
806
- best_prompt_overall = str(current_best_for_primary)
957
+ current_best_for_primary: creator.Individual = max(hof, key=lambda ind: ind.fitness.values[0])
958
+ best_primary_score_overall: float = current_best_for_primary.fitness.values[0]
959
+ best_prompt_overall = chat_prompt.ChatPrompt(messages=current_best_for_primary)
807
960
  else:
808
961
  # Single-objective
809
962
  current_best_on_front = hof[0]
810
- best_primary_score_overall = current_best_on_front.fitness.values[0]
963
+ best_primary_score_overall: float = current_best_on_front.fitness.values[0]
811
964
 
812
965
  if self.enable_moo:
813
- logger.info(f"Gen {0}: New best primary score: {best_primary_score_overall:.4f}, Prompt: {best_prompt_overall[:100]}...")
966
+ logger.info(f"Gen {0}: New best primary score: {best_primary_score_overall:.4f}, Prompt: {json.dumps(best_prompt_overall.formatted_messages)[:100]}...")
814
967
  else:
815
968
  logger.info(f"Gen {0}: New best score: {best_primary_score_overall:.4f}")
816
969
 
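Editor's note on Step 5 above: the optimizer keeps either a Pareto front (multi-objective: maximize the metric score, minimize prompt length, matching the (score, length) fitness returned earlier) or a fixed-size hall of fame (single-objective). The following is a minimal, self-contained DEAP sketch of that setup; the FitnessMOO/MessageListIndividual names and the random fitness values are illustrative only, not part of the package.

import random
from deap import base, creator, tools

# Two objectives, mirroring the MOO fitness above: maximize score, minimize length.
creator.create("FitnessMOO", base.Fitness, weights=(1.0, -1.0))
creator.create("MessageListIndividual", list, fitness=creator.FitnessMOO)

population = [
    creator.MessageListIndividual([{"role": "user", "content": f"Prompt variant {i}"}])
    for i in range(4)
]
for individual in population:
    # Stand-in for _deap_evaluate_individual_fitness: (primary score, prompt length).
    individual.fitness.values = (random.random(), float(len(str(individual))))

pareto_front = tools.ParetoFront()      # MOO: keeps all non-dominated prompts
pareto_front.update(population)
hall_of_fame = tools.HallOfFame(10)     # SO alternative: keeps the top-N prompts
hall_of_fame.update(population)
best = max(pareto_front, key=lambda ind: ind.fitness.values[0])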
@@ -826,132 +979,65 @@ Ensure a good mix of variations, all targeting the specified output style from t
826
979
  ).dict()
827
980
  self._add_to_history(initial_round_data)
828
981
 
829
- pbar_desc = f"{log_prefix} Evolutionary Optimization"
830
- pbar_postfix_key = "best_primary_score" if self.enable_moo else "best_score"
831
- pbar = tqdm(
832
- total=self.num_generations,
833
- desc=pbar_desc,
834
- unit="gen",
835
- disable=self.verbose < 1,
836
- postfix={pbar_postfix_key: f"{best_primary_score_overall:.4f}", "llm_calls": self.llm_call_counter}
837
- )
982
+ with reporting.start_evolutionary_algo(verbose=self.verbose) as report_evolutionary_algo:
983
+ for generation_idx in range(1, self.num_generations + 1):
984
+ report_evolutionary_algo.start_gen(generation_idx, self.num_generations)
838
985
 
839
- gen = 0
840
- for gen_idx in range(1, self.num_generations + 1):
841
- gen = gen_idx
842
- self._current_generation = gen
843
- pbar.set_postfix({pbar_postfix_key: f"{best_primary_score_overall:.4f}", "llm_calls": self.llm_call_counter})
844
- previous_best_primary_score_for_gen = best_primary_score_overall
845
-
846
- # Population restart logic
847
- current_pop_best_primary = 0.0
848
- if deap_population and deap_population[0].fitness.valid:
849
- current_pop_best_primary = max(ind.fitness.values[0] for ind in deap_population if ind.fitness.valid)
850
-
851
- if self._best_fitness_history and current_pop_best_primary < self._best_fitness_history[-1] * (1 + self.DEFAULT_RESTART_THRESHOLD):
852
- self._generations_without_improvement += 1
853
- else:
854
- self._generations_without_improvement = 0
855
- self._best_fitness_history.append(current_pop_best_primary)
986
+ curr_best_score = self._population_best_score(deap_population)
856
987
 
857
- if self._generations_without_improvement >= self.DEFAULT_RESTART_GENERATIONS:
858
- logger.info(f"Detected stagnation in primary objective at gen {gen}. Restarting population...")
859
- elites_for_restart = list(hof) if self.enable_moo else list(tools.selBest(deap_population, self.elitism_size))
860
- seed_prompt_for_restart = str(max(elites_for_restart, key=lambda ind: ind.fitness.values[0])) if elites_for_restart else best_prompt_overall
861
-
862
- new_population_strings = self._initialize_population(initial_prompt=seed_prompt_for_restart, task_config=task_config)
863
- deap_population = [creator.Individual(p_str) for p_str in new_population_strings]
864
- self._generations_without_improvement = 0
865
- fitnesses_new = list(map(self.toolbox.evaluate, deap_population))
866
- for ind, fit in zip(deap_population, fitnesses_new):
867
- ind.fitness.values = fit
868
- # Offspring will be selected from this new population in the next step
869
-
870
- # Standard DEAP evolutionary algorithm steps
871
- if self.enable_moo:
872
- # NSGA-II is used for MOO
873
- offspring = self.toolbox.select(deap_population, self.population_size)
874
- else:
875
- # Single-objective: Elitism + Selection
876
- elites = tools.selBest(deap_population, self.elitism_size)
877
- selected_offspring = self.toolbox.select(deap_population, len(deap_population) - self.elitism_size)
878
- offspring = elites + selected_offspring
879
-
880
- # Set up the offspring for the next generation
881
- offspring = list(map(self.toolbox.clone, offspring))
882
- for child1, child2 in zip(offspring[::2], offspring[1::2]):
883
- if random.random() < self.crossover_rate:
884
- self.toolbox.mate(child1, child2)
885
- del child1.fitness.values
886
- del child2.fitness.values
887
-
888
- # Mutate the offspring
889
- current_mutation_rate = self._get_adaptive_mutation_rate()
890
- for mutant in offspring:
891
- if random.random() < current_mutation_rate:
892
- self.toolbox.mutate(mutant, task_config=task_config)
893
- del mutant.fitness.values
894
-
895
- # Evaluate the offspring
896
- invalid_ind = [ind for ind in offspring if not ind.fitness.valid]
897
- fitnesses_eval = map(self.toolbox.evaluate, invalid_ind)
898
- for ind, fit in zip(invalid_ind, fitnesses_eval):
899
- ind.fitness.values = fit
900
-
901
- # Update the hall of fame
902
- hof.update(offspring)
903
- deap_population[:] = offspring # Replace population
988
+ # ---------- restart logic -------------------------------------
989
+ if self._should_restart_population(curr_best_score):
990
+ report_evolutionary_algo.restart_population(self.DEFAULT_RESTART_GENERATIONS)
991
+ deap_population = self._restart_population(
992
+ hof, deap_population, best_prompt_overall
993
+ )
904
994
 
905
- # Update overall best score and prompt (based on primary objective for consistency)
906
- if hof and len(hof) > 0:
907
- if self.enable_moo:
908
- current_best_on_front = max(hof, key=lambda ind: ind.fitness.values[0])
909
- updated_best_primary_score = current_best_on_front.fitness.values[0]
910
- else:
911
- # Single-objective
912
- current_best_on_front = hof[0]
913
- updated_best_primary_score = current_best_on_front.fitness.values[0]
914
-
915
- if updated_best_primary_score > best_primary_score_overall:
916
- best_primary_score_overall = updated_best_primary_score
917
- best_prompt_overall = str(current_best_on_front)
918
- logger.info(f"Gen {gen}: New best primary score: {best_primary_score_overall:.4f}, Prompt: {best_prompt_overall[:100]}...")
919
- self._generations_without_overall_improvement = 0
920
- elif updated_best_primary_score == previous_best_primary_score_for_gen:
921
- # Check against score at start of this gen's logic
922
- self._generations_without_overall_improvement += 1
995
+ # ---------- run one generation --------------------------------
996
+ deap_population, invalid_count = self._run_generation(
997
+ generation_idx, deap_population, prompt, hof, report_evolutionary_algo, best_primary_score_overall
998
+ )
999
+
1000
+ # -------- update best-prompt bookkeeping -------------------------
1001
+ previous_best_primary_score_for_gen = best_primary_score_overall
1002
+ if hof:
1003
+ if self.enable_moo:
1004
+ current_best_ind = max(hof, key=lambda ind: ind.fitness.values[0])
1005
+ else:
1006
+ current_best_ind = hof[0]
1007
+
1008
+ updated_best_primary_score = current_best_ind.fitness.values[0]
1009
+ if updated_best_primary_score > best_primary_score_overall:
1010
+ best_primary_score_overall = updated_best_primary_score
1011
+ self._generations_without_overall_improvement = 0
1012
+ elif updated_best_primary_score == previous_best_primary_score_for_gen:
1013
+ self._generations_without_overall_improvement += 1
1014
+ else:
1015
+ self._generations_without_overall_improvement += 1
923
1016
  else:
924
- # Score might have decreased or HOF is empty (less likely for SO HOF with size > 0)
925
1017
  self._generations_without_overall_improvement += 1
926
- else:
927
- # Score might have decreased or HOF is empty (less likely for SO HOF with size > 0)
928
- self._generations_without_overall_improvement += 1
929
-
930
- record = record_stats.compile(deap_population) if record_stats else {}
931
- logbook.record(gen=gen, evals=len(invalid_ind), **record)
932
- if self.verbose >= 1:
933
- print(logbook.stream)
934
-
935
- # History logging for this transition
936
- # FIXME: Use model.dump() instead of dict()
937
- gen_round_data = OptimizationRound(
938
- round_number=gen,
939
- current_prompt=best_prompt_overall, # Representative best
940
- current_score=best_primary_score_overall,
941
- generated_prompts=[{"prompt": str(ind), "score": ind.fitness.values[0]} for ind in deap_population if ind.fitness.valid],
942
- best_prompt=best_prompt_overall,
943
- best_score=best_primary_score_overall,
944
- improvement=(best_primary_score_overall - initial_primary_score) / abs(initial_primary_score) if initial_primary_score and initial_primary_score != 0 else (1.0 if best_primary_score_overall > 0 else 0.0)
945
- ).dict()
946
- self._add_to_history(gen_round_data)
947
- pbar.update(1)
948
1018
 
949
- if self._generations_without_overall_improvement >= self.DEFAULT_EARLY_STOPPING_GENERATIONS:
950
- logger.info(f"Overall best score has not improved for {self.DEFAULT_EARLY_STOPPING_GENERATIONS} generations. Stopping early at gen {gen}.")
951
- break
1019
+ # ---------- early-stopping check ------------------------------
1020
+ if self._generations_without_overall_improvement >= self.DEFAULT_EARLY_STOPPING_GENERATIONS:
1021
+ logger.info(
1022
+ "No overall improvement for %d generations – early stopping at gen %d.",
1023
+ self.DEFAULT_EARLY_STOPPING_GENERATIONS,
1024
+ generation_idx,
1025
+ )
1026
+ break
1027
+
1028
+ # History logging for this transition
1029
+ # FIXME: Use model.dump() instead of dict()
1030
+ gen_round_data = OptimizationRound(
1031
+ round_number=generation_idx,
1032
+ current_prompt=best_prompt_overall, # Representative best
1033
+ current_score=best_primary_score_overall,
1034
+ generated_prompts=[{"prompt": str(ind), "score": ind.fitness.values[0]} for ind in deap_population if ind.fitness.valid],
1035
+ best_prompt=best_prompt_overall,
1036
+ best_score=best_primary_score_overall,
1037
+ improvement=(best_primary_score_overall - initial_primary_score) / abs(initial_primary_score) if initial_primary_score and initial_primary_score != 0 else (1.0 if best_primary_score_overall > 0 else 0.0)
1038
+ ).dict()
1039
+ self._add_to_history(gen_round_data)
952
1040
 
953
- pbar.close()
954
- logger.info(f"\n{log_prefix} Evolutionary Optimization finished after {gen} generations.")
955
1041
  stopped_early_flag = self._generations_without_overall_improvement >= self.DEFAULT_EARLY_STOPPING_GENERATIONS
956
1042
  final_details = {}
957
1043
  initial_score_for_display = initial_primary_score
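Editor's note on the bookkeeping in the generation loop above: each round records a relative improvement over the baseline score, and the run stops early once the best score has stopped improving for a fixed number of generations. A condensed sketch follows; the early-stopping threshold value is assumed for illustration, the real value is the class constant DEFAULT_EARLY_STOPPING_GENERATIONS.

def relative_improvement(initial_score: float, best_score: float) -> float:
    # Same guard as the `improvement` field above: avoid dividing by a zero baseline.
    if initial_score:
        return (best_score - initial_score) / abs(initial_score)
    return 1.0 if best_score > 0 else 0.0

EARLY_STOPPING_GENERATIONS = 5   # assumed value, for illustration only
best_score_overall = 0.40
generations_without_improvement = 0
for generation_idx, gen_best in enumerate([0.42, 0.45, 0.45, 0.45, 0.45, 0.45, 0.45], start=1):
    if gen_best > best_score_overall:
        best_score_overall = gen_best
        generations_without_improvement = 0
    else:
        generations_without_improvement += 1
    if generations_without_improvement >= EARLY_STOPPING_GENERATIONS:
        break   # stop the evolutionary loop early, as in the loop above

print(relative_improvement(0.40, best_score_overall))  # 0.125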
@@ -963,12 +1049,12 @@ Ensure a good mix of variations, all targeting the specified output style from t
963
1049
  for i, sol in enumerate(sorted_hof):
964
1050
  final_results_log += f" Solution {i+1}: Primary Score={sol.fitness.values[0]:.4f}, Length={sol.fitness.values[1]:.0f}, Prompt='{str(sol)[:100]}...'\n"
965
1051
  best_overall_solution = sorted_hof[0]
966
- final_best_prompt = str(best_overall_solution)
1052
+ final_best_prompt = chat_prompt.ChatPrompt(messages=best_overall_solution)
967
1053
  final_primary_score = best_overall_solution.fitness.values[0]
968
1054
  final_length = best_overall_solution.fitness.values[1]
969
1055
  logger.info(final_results_log)
970
1056
  logger.info(f"Representative best prompt (highest primary score from Pareto front): '{final_best_prompt}'")
971
- logger.info(f" Primary Score ({metric_config.metric.name}): {final_primary_score:.4f}")
1057
+ logger.info(f" Primary Score ({metric.__name__}): {final_primary_score:.4f}")
972
1058
  logger.info(f" Length: {final_length:.0f}")
973
1059
  final_details.update({
974
1060
  "initial_primary_score": initial_primary_score,
@@ -986,7 +1072,7 @@ Ensure a good mix of variations, all targeting the specified output style from t
986
1072
  logger.warning("MOO: ParetoFront is empty. Reporting last known best.")
987
1073
  final_best_prompt = best_prompt_overall
988
1074
  final_primary_score = best_primary_score_overall
989
- final_length = float(len(final_best_prompt))
1075
+ final_length = float(len(json.dumps(final_best_prompt.formatted_messages)))
990
1076
  final_details.update({"initial_primary_score": initial_primary_score, "initial_length": initial_length,
991
1077
  "final_prompt_representative": final_best_prompt, "final_primary_score_representative": final_primary_score,
992
1078
  "final_length_representative": final_length, "pareto_front_solutions": []})
@@ -995,9 +1081,9 @@ Ensure a good mix of variations, all targeting the specified output style from t
995
1081
  final_best_prompt = best_prompt_overall
996
1082
  final_primary_score = best_primary_score_overall
997
1083
  logger.info(f"Final best prompt from Hall of Fame: '{final_best_prompt}'")
998
- logger.info(f"Final best score ({metric_config.metric.name}): {final_primary_score:.4f}")
1084
+ logger.info(f"Final best score ({metric.__name__}): {final_primary_score:.4f}")
999
1085
  final_details.update({
1000
- "initial_prompt": task_config.instruction_prompt,
1086
+ "initial_prompt": prompt.formatted_messages,
1001
1087
  "initial_score": initial_primary_score,
1002
1088
  "initial_score_for_display": initial_primary_score,
1003
1089
  "final_prompt": final_best_prompt,
@@ -1014,15 +1100,13 @@ Ensure a good mix of variations, all targeting the specified output style from t
1014
1100
 
1015
1101
  # Add final details
1016
1102
  final_details.update({
1017
- "total_generations_run": gen,
1103
+ "total_generations_run": generation_idx + 1,
1018
1104
  "population_size": self.population_size,
1019
1105
  "mutation_probability": self.mutation_rate,
1020
1106
  "crossover_probability": self.crossover_rate,
1021
1107
  "elitism_size": self.elitism_size if not self.enable_moo else "N/A (MOO uses NSGA-II)",
1022
1108
  "adaptive_mutation": self.adaptive_mutation,
1023
- "deap_logbook": logbook.stream if logbook else "Not available",
1024
- "task_config": task_config.dict(),
1025
- "metric_config": metric_config.dict(),
1109
+ "metric_name": metric.__name__,
1026
1110
  "model": self.model,
1027
1111
  "moo_enabled": self.enable_moo,
1028
1112
  "llm_crossover_enabled": self.enable_llm_crossover,
@@ -1039,11 +1123,17 @@ Ensure a good mix of variations, all targeting the specified output style from t
1039
1123
  })
1040
1124
 
1041
1125
  # Return the OptimizationResult
1126
+ reporting.display_result(
1127
+ initial_score=initial_score_for_display,
1128
+ best_score=final_primary_score,
1129
+ best_prompt=final_best_prompt.formatted_messages,
1130
+ verbose=self.verbose
1131
+ )
1042
1132
  return OptimizationResult(
1043
1133
  optimizer=self.__class__.__name__,
1044
- prompt=final_best_prompt,
1134
+ prompt=final_best_prompt.formatted_messages,
1045
1135
  score=final_primary_score,
1046
- metric_name=metric_config.metric.name,
1136
+ metric_name=metric.__name__,
1047
1137
  details=final_details,
1048
1138
  history=self.get_history(),
1049
1139
  llm_calls=self.llm_call_counter
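Editor's note: the OptimizationResult assembled above now exposes the best prompt as a list of role/content message dicts rather than a single string. A small sketch of reading those fields; the summarize_result helper is hypothetical and only touches attributes populated in this method.

def summarize_result(result) -> str:
    """Hypothetical helper that reads the OptimizationResult fields populated above."""
    lines = [f"{result.metric_name}: {result.score:.4f} ({result.llm_calls} LLM calls)"]
    for message in result.prompt:                  # list of {"role", "content"} dicts
        lines.append(f"  [{message['role']}] {message['content'][:80]}")
    lines.append(f"rounds recorded: {len(result.history)}")
    return "\n".join(lines)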
@@ -1052,8 +1142,7 @@ Ensure a good mix of variations, all targeting the specified output style from t
1052
1142
  @_throttle.rate_limited(_rate_limiter)
1053
1143
  def _call_model(
1054
1144
  self,
1055
- prompt: str,
1056
- system_prompt: Optional[str] = None,
1145
+ messages: List[Dict[str, str]],
1057
1146
  is_reasoning: bool = False,
1058
1147
  optimization_id: Optional[str] = None,
1059
1148
  ) -> str:
@@ -1084,11 +1173,6 @@ Ensure a good mix of variations, all targeting the specified output style from t
1084
1173
  if metadata_for_opik:
1085
1174
  llm_config_params["metadata"] = metadata_for_opik
1086
1175
 
1087
- messages = []
1088
- if system_prompt:
1089
- messages.append({"role": "system", "content": system_prompt})
1090
- messages.append({"role": "user", "content": prompt})
1091
-
1092
1176
  # Pass llm_config_params to the Opik monitor
1093
1177
  final_call_params = opik_litellm_monitor.try_add_opik_monitoring_to_params(
1094
1178
  llm_config_params.copy()
@@ -1102,14 +1186,16 @@ Ensure a good mix of variations, all targeting the specified output style from t
1102
1186
  response = litellm.completion(
1103
1187
  model=self.model, messages=messages, **final_call_params
1104
1188
  )
1189
+
1190
+ logger.debug(f"Response: {response}")
1105
1191
  return response.choices[0].message.content
1106
- except litellm.exceptions.RateLimitError as e:
1192
+ except litellm_exceptions.RateLimitError as e:
1107
1193
  logger.error(f"LiteLLM Rate Limit Error: {e}")
1108
1194
  raise
1109
- except litellm.exceptions.APIConnectionError as e:
1195
+ except litellm_exceptions.APIConnectionError as e:
1110
1196
  logger.error(f"LiteLLM API Connection Error: {e}")
1111
1197
  raise
1112
- except litellm.exceptions.ContextWindowExceededError as e:
1198
+ except litellm_exceptions.ContextWindowExceededError as e:
1113
1199
  logger.error(f"LiteLLM Context Window Exceeded Error: {e}")
1114
1200
  raise
1115
1201
  except Exception as e:
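Editor's note: _call_model above now takes a ready-made messages list instead of separate prompt/system_prompt strings, and lets LiteLLM's typed exceptions propagate after logging them. A minimal sketch of that calling pattern under stated assumptions (the model name and prompts are placeholders, and error handling is reduced to re-raising):

import litellm
from litellm import exceptions as litellm_exceptions

def call_chat_model(model: str, system_prompt: str, user_prompt: str) -> str:
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    try:
        response = litellm.completion(model=model, messages=messages)
        return response.choices[0].message.content
    except (litellm_exceptions.RateLimitError,
            litellm_exceptions.APIConnectionError,
            litellm_exceptions.ContextWindowExceededError):
        # The optimizer logs these cases separately; here we simply propagate them.
        raise

# Example (requires provider credentials):
# call_chat_model("openai/gpt-4o-mini", "You are terse.", "Say hi.")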
@@ -1118,10 +1204,9 @@ Ensure a good mix of variations, all targeting the specified output style from t
1118
1204
 
1119
1205
  def evaluate_prompt(
1120
1206
  self,
1121
- dataset: Union[str, opik.Dataset],
1122
- metric_config: MetricConfig,
1123
- task_config: TaskConfig,
1124
- prompt: str,
1207
+ prompt: chat_prompt.ChatPrompt,
1208
+ dataset: opik.Dataset,
1209
+ metric: Callable,
1125
1210
  n_samples: Optional[int] = None,
1126
1211
  dataset_item_ids: Optional[List[str]] = None,
1127
1212
  experiment_config: Optional[Dict] = None,
@@ -1130,57 +1215,32 @@ Ensure a good mix of variations, all targeting the specified output style from t
1130
1215
  ) -> float:
1131
1216
  """
1132
1217
  Evaluate a single prompt (individual) against the dataset.
1133
- Adapted from MetaPromptOptimizer._evaluate_prompt.
1218
+
1219
+ Args:
1220
+ prompt: The prompt to evaluate
1221
+ dataset: The dataset to use for evaluation
1222
+ metric: Metric function used to score the LLM output; it should accept the arguments `dataset_item` and `llm_output`
1223
+ n_samples: Optional number of samples to use
1224
+ dataset_item_ids: Optional list of dataset item IDs to use
1225
+ experiment_config: Optional experiment configuration
1226
+ optimization_id: Optional optimization ID
1227
+ verbose: Controls internal logging/progress bars (0=off, 1=on).
1228
+
1229
+ Returns:
1230
+ float: The metric value
1134
1231
  """
1135
- effective_verbose = self.verbose if verbose == 0 else verbose
1136
-
1137
- if isinstance(dataset, str):
1138
- # This should ideally be done once in optimize_prompt if dataset is a string
1139
- # but if called standalone, we need to handle it.
1140
- # TODO Move to base class
1141
- opik_eval_dataset = self._opik_client.get_dataset(dataset)
1142
- else:
1143
- opik_eval_dataset = dataset
1144
-
1145
- total_items = len(opik_eval_dataset.get_items())
1232
+ total_items = len(dataset.get_items())
1146
1233
 
1147
- # Determine subset_size for this evaluation run
1148
- # TODO Move to dataset utils
1149
- if dataset_item_ids:
1150
- subset_size = len(dataset_item_ids)
1151
- logger.debug(f"Using provided {subset_size} dataset_item_ids for evaluation.")
1152
- elif n_samples is not None:
1153
- if n_samples > total_items:
1154
- logger.warning(
1155
- f"Requested n_samples ({n_samples}) for individual evaluation is larger than dataset size ({total_items}). Using full dataset."
1156
- )
1157
- subset_size = None
1158
- elif n_samples <= 0:
1159
- logger.warning(
1160
- f"Requested n_samples ({n_samples}) is <=0. Using full dataset for this evaluation."
1161
- )
1162
- subset_size = None
1163
- else:
1164
- subset_size = n_samples
1165
- logger.debug(f"Using specified n_samples: {subset_size} items for this evaluation run.")
1166
- else:
1167
- # Default behavior if no n_samples and no dataset_item_ids are given for this specific call
1168
- # This case should be rare if n_samples is passed down from optimize_prompt
1169
- subset_size = min(total_items, min(20, max(10, int(total_items * 0.2))))
1170
- logger.debug(
1171
- f"Using automatic subset size for this evaluation: {subset_size} items (20% of {total_items} total items)"
1172
- )
1173
-
1174
1234
  current_experiment_config = experiment_config or {}
1175
1235
  current_experiment_config = {
1176
1236
  **current_experiment_config,
1177
1237
  **{
1178
1238
  "optimizer": self.__class__.__name__,
1179
- "metric": metric_config.metric.name,
1180
- "dataset": opik_eval_dataset.name,
1239
+ "metric": metric.__name__,
1240
+ "dataset": dataset.name,
1181
1241
  "configuration": {
1182
- "prompt_evaluated": prompt,
1183
- "n_samples_for_eval": subset_size if dataset_item_ids is None else len(dataset_item_ids),
1242
+ "prompt": prompt.formatted_messages,
1243
+ "n_samples_for_eval": len(dataset_item_ids) if dataset_item_ids is not None else n_samples,
1184
1244
  "total_dataset_items": total_items,
1185
1245
  },
1186
1246
  },
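Editor's note on the new evaluate_prompt signature: the metric is now a plain callable taking dataset_item and llm_output, and the prompt is a chat_prompt.ChatPrompt whose message contents may contain str.format placeholders filled from each dataset item (see llm_task in the following hunk). A hedged usage sketch; the question/answer field names, the Levenshtein-based metric, the float return type, and the optimizer/dataset variables are assumptions for illustration only.

import Levenshtein  # already a dependency of this module
from opik_optimizer.optimization_config import chat_prompt

def answer_similarity(dataset_item: dict, llm_output: str) -> float:
    # Metric callable with the (dataset_item, llm_output) signature described above.
    return Levenshtein.ratio(str(dataset_item["answer"]), llm_output.strip())

prompt = chat_prompt.ChatPrompt(messages=[
    {"role": "system", "content": "Answer with a short factual phrase."},
    {"role": "user", "content": "Question: {question}"},   # "{question}" filled per item
])

score = optimizer.evaluate_prompt(   # `optimizer` and `dataset` assumed to exist
    prompt=prompt,
    dataset=dataset,                 # an opik.Dataset whose items carry "question"/"answer"
    metric=answer_similarity,
    n_samples=20,
)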
@@ -1189,81 +1249,35 @@ Ensure a good mix of variations, all targeting the specified output style from t
1189
1249
  def llm_task(
1190
1250
  dataset_item: Dict[str, Any]
1191
1251
  ) -> Dict[str, str]:
1192
- if hasattr(dataset_item, "to_dict"):
1193
- dataset_item = dataset_item.to_dict()
1194
-
1195
- for input_key in task_config.input_dataset_fields:
1196
- if input_key not in dataset_item:
1197
- raise ValueError(f"Input field '{input_key}' not found in dataset sample: {dataset_item}")
1198
- if task_config.output_dataset_field not in dataset_item:
1199
- raise ValueError(f"Output field '{task_config.output_dataset_field}' not found in dataset sample: {dataset_item}")
1200
-
1201
- prompt_for_llm: str
1202
- field_mapping = {
1203
- field: dataset_item[field]
1204
- for field in task_config.input_dataset_fields
1205
- if field in dataset_item
1206
- }
1207
-
1208
- if getattr(task_config, "use_chat_prompt", False):
1209
- candidate_template = Template(prompt)
1210
- user_content_parts = []
1211
- for field_name in task_config.input_dataset_fields:
1212
- if field_name in dataset_item:
1213
- user_content_parts.append(f"{field_name.capitalize()}: {dataset_item[field_name]}")
1214
- user_content = "\n".join(user_content_parts)
1215
-
1216
- raw_model_output = self._call_model(
1217
- prompt=user_content,
1218
- system_prompt=prompt,
1219
- is_reasoning=False
1220
- )
1221
-
1222
- else:
1223
- input_clauses = []
1224
- for field_name in task_config.input_dataset_fields:
1225
- if field_name in dataset_item:
1226
- input_clauses.append(
1227
- f"{field_name.capitalize()}: {dataset_item[field_name]}"
1228
- )
1229
- item_specific_inputs_str = "\n".join(input_clauses)
1230
- prompt_for_llm = f"{prompt}\n\n{item_specific_inputs_str}"
1231
-
1232
- raw_model_output = self._call_model(
1233
- prompt=prompt_for_llm,
1234
- system_prompt=None,
1235
- is_reasoning=False
1236
- )
1252
+ try:
1253
+ messages = [{
1254
+ "role": item["role"],
1255
+ "content": item["content"].format(**dataset_item)
1256
+ } for item in prompt.formatted_messages]
1257
+ except Exception as e:
1258
+ logger.warning(f"Error formatting prompt messages in llm_task (usually a template parsing error): {e}")
1259
+ return {mappers.EVALUATED_LLM_TASK_OUTPUT: ""}
1237
1260
 
1238
- cleaned_model_output = raw_model_output.strip()
1239
- output_field = task_config.output_dataset_field
1240
- prefixes_to_strip = [f"{output_field.capitalize()}:", f"{output_field}:", "Answer:"]
1241
- for prefix in prefixes_to_strip:
1242
- if cleaned_model_output.lower().startswith(prefix.lower()):
1243
- cleaned_model_output = cleaned_model_output[len(prefix):].strip()
1244
- break
1261
+ model_output = self._call_model(
1262
+ messages=messages,
1263
+ is_reasoning=False
1264
+ )
1245
1265
 
1246
- return {mappers.EVALUATED_LLM_TASK_OUTPUT: cleaned_model_output}
1247
-
1248
- logger.debug(
1249
- f"Starting evaluation for a prompt with {subset_size if subset_size else 'all'} samples (or specific IDs) for metric: {metric_config.metric.name}"
1250
- )
1266
+ return {mappers.EVALUATED_LLM_TASK_OUTPUT: model_output}
1251
1267
 
1252
1268
  # Evaluate the prompt
1253
1269
  score = task_evaluator.evaluate(
1254
- dataset=opik_eval_dataset,
1270
+ dataset=dataset,
1255
1271
  dataset_item_ids=dataset_item_ids,
1256
- metric_config=metric_config,
1272
+ metric=metric,
1257
1273
  evaluated_task=llm_task,
1258
1274
  num_threads=self.num_threads,
1259
1275
  project_name=self.project_name,
1260
- n_samples=subset_size if dataset_item_ids is None else None,
1276
+ n_samples=n_samples if dataset_item_ids is None else None,
1261
1277
  experiment_config=current_experiment_config,
1262
1278
  optimization_id=optimization_id,
1263
- # FIXME: Hack for verbose till its merged
1264
- #verbose=effective_verbose,
1279
+ verbose=verbose
1265
1280
  )
1266
- logger.debug(f"Evaluation score for prompt: {score:.4f}")
1267
1281
  return score
1268
1282
 
1269
1283
  def _llm_deap_crossover(
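Editor's note, before the crossover helper continues below: a standalone sketch of the per-item templating that llm_task above applies. Every message's content is treated as a str.format template and filled from the dataset item; a formatting failure makes the item score against an empty output. Field names are illustrative.

def render_messages(formatted_messages, dataset_item: dict):
    try:
        return [
            {"role": m["role"], "content": m["content"].format(**dataset_item)}
            for m in formatted_messages
        ]
    except (KeyError, IndexError, ValueError):
        # Mirrors the fallback above: treat the item as producing an empty output.
        return None

rendered = render_messages(
    [{"role": "user", "content": "Question: {question}"}],
    {"question": "Who wrote Hamlet?", "answer": "William Shakespeare"},
)
# rendered == [{"role": "user", "content": "Question: Who wrote Hamlet?"}]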
@@ -1272,42 +1286,48 @@ Ensure a good mix of variations, all targeting the specified output style from t
1272
1286
  ind2: "creator.Individual"
1273
1287
  ) -> Tuple["creator.Individual", "creator.Individual"]:
1274
1288
  """Perform crossover by asking an LLM to blend two parent prompts."""
1275
- parent1_str = str(ind1)
1276
- parent2_str = str(ind2)
1289
+ reporting.display_message(" Recombining prompts using an LLM.", verbose=self.verbose)
1290
+
1291
+ parent1_messages: List[Dict[Literal["role", "content"], str]] = ind1
1292
+ parent2_messages: List[Dict[Literal["role", "content"], str]] = ind2
1277
1293
  current_output_style_guidance = self.output_style_guidance
1278
1294
 
1279
1295
  user_prompt_for_llm_crossover = f"""Parent Prompt 1:
1280
- '''{parent1_str}'''
1296
+ '''{parent1_messages}'''
1281
1297
 
1282
1298
  Parent Prompt 2:
1283
- '''{parent2_str}'''
1299
+ '''{parent2_messages}'''
1284
1300
 
1285
1301
  Desired output style from target LLM for children prompts: '{current_output_style_guidance}'
1286
1302
 
1287
- Please generate one or two child prompts by intelligently blending the ideas, styles, or structures from these two parents, ensuring the children aim to elicit the desired output style.
1288
- Follow the instructions provided in the system prompt regarding the JSON output format ({{"children_prompts": ["child1", ... ]}}).
1303
+ Please generate TWO child prompts by intelligently blending the ideas, styles, or structures from these two parents, ensuring the children aim to elicit the desired output style.
1304
+ Follow the instructions provided in the system prompt regarding the JSON output format:
1305
+ [
1306
+ [{{"role": "<role>", "content": "<content>"}}, {{"role": "<role>", "content": "<content>"}}], #child_1
1307
+ [{{"role": "<role>", "content": "<content>"}}, {{"role": "<role>", "content": "<content>"}}], #child_2
1308
+ ]
1289
1309
  """
1290
1310
  try:
1291
- logger.debug(f"Attempting LLM-driven crossover between: '{parent1_str[:50]}...' and '{parent2_str[:50]}...' aiming for style: '{current_output_style_guidance[:30]}...'")
1311
+ logger.debug(f"Attempting LLM-driven crossover between: '{parent1_messages[:50]}...' and '{parent2_messages[:50]}...' aiming for style: '{current_output_style_guidance[:30]}...'")
1292
1312
  response_content = self._call_model(
1293
- prompt=user_prompt_for_llm_crossover,
1294
- system_prompt=self.get_llm_crossover_system_prompt(),
1313
+ messages=[
1314
+ {"role": "system", "content": self.get_llm_crossover_system_prompt()},
1315
+ {"role": "user", "content": user_prompt_for_llm_crossover},
1316
+ ],
1295
1317
  is_reasoning=True
1296
1318
  )
1297
1319
  logger.debug(f"Raw LLM response for crossover: {response_content}")
1298
1320
 
1299
- json_response = json.loads(response_content)
1300
- children_strings = json_response.get("children_prompts", [])
1301
-
1302
- if not children_strings or not isinstance(children_strings, list) or not all(isinstance(cs, str) for cs in children_strings):
1321
+ json_response = utils.json_to_dict(response_content)
1322
+ if not isinstance(json_response, list) or len(json_response) != 2 or not all(isinstance(cs, list) for cs in json_response):
1303
1323
  logger.warning("LLM Crossover: Malformed or empty children_prompts list. Falling back.")
1304
1324
  raise ValueError("Malformed LLM crossover response")
1305
1325
 
1306
- child1_str = children_strings[0]
1307
- child2_str = children_strings[1] if len(children_strings) > 1 else self._deap_mutation(creator.Individual(parent2_str), task_config=None)[0] # task_config might not be available or needed here for simple mutation
1326
+ child1: List[Dict[Literal["role", "content"], str]] = json_response[0]
1327
+ child2: List[Dict[Literal["role", "content"], str]] = json_response[1]
1308
1328
 
1309
- logger.debug(f"LLM Crossover generated child1: {child1_str[:50]}... Child2: {child2_str[:50]}...")
1310
- return creator.Individual(child1_str), creator.Individual(str(child2_str))
1329
+ logger.debug(f"LLM Crossover generated child1: {json.dumps(child1)[:50]}... Child2: {json.dumps(child2)[:50]}...")
1330
+ return creator.Individual(child1), creator.Individual(child2)
1311
1331
 
1312
1332
  except Exception as e:
1313
1333
  logger.warning(f"LLM-driven crossover failed: {e}. Falling back to standard crossover.")
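Editor's note on the LLM-driven crossover above: the model's reply must be a JSON list of exactly two children, each itself a list of role/content messages; anything else triggers the fallback to standard crossover. A minimal validation sketch, using json.loads in place of the package's utils.json_to_dict:

import json

def parse_crossover_children(response_content: str):
    children = json.loads(response_content)
    if (
        not isinstance(children, list)
        or len(children) != 2
        or not all(isinstance(child, list) for child in children)
    ):
        raise ValueError("Malformed LLM crossover response")
    return children[0], children[1]

child1, child2 = parse_crossover_children(
    '[[{"role": "user", "content": "variant A"}],'
    ' [{"role": "user", "content": "variant B"}]]'
)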
@@ -1315,17 +1335,15 @@ Follow the instructions provided in the system prompt regarding the JSON output
1315
1335
 
1316
1336
  def _get_task_description_for_llm(
1317
1337
  self,
1318
- task_config: TaskConfig
1338
+ prompt: chat_prompt.ChatPrompt
1319
1339
  ) -> str:
1320
1340
  """Generates a concise task description for use in LLM prompts for fresh generation or radical innovation."""
1321
- input_fields_str = ", ".join(task_config.input_dataset_fields)
1322
- output_field_str = task_config.output_dataset_field
1323
- description = f"Task: Given input(s) from field(s) '{input_fields_str}', generate a response for the field '{output_field_str}'. "
1324
- description += f"The original high-level instruction being optimized is: '{task_config.instruction_prompt}'. "
1341
+ description = "Task: Given a list of AI messages with placeholder values, generate an effective prompt. "
1342
+ description += f"The original high-level instruction being optimized is: '{prompt.formatted_messages}'. "
1325
1343
  description += "The goal is to create an effective prompt that guides a language model to perform this task well."
1326
1344
  return description
1327
1345
 
1328
- def get_reasoning_system_prompt_for_variation(self) -> str:
1346
+ def _get_reasoning_system_prompt_for_variation(self) -> str:
1329
1347
  return f"""You are an expert prompt engineer specializing in creating diverse and effective prompts. Given an initial prompt, your task is to generate a diverse set of alternative prompts.
1330
1348
 
1331
1349
  For each prompt variation, consider:
@@ -1367,13 +1385,18 @@ Consider the following when generating children:
1367
1385
  - You can create a child that is a direct blend, or one that takes a primary structure from one parent and incorporates specific elements from the other, always optimizing for clear instruction towards the desired output style.
1368
1386
  - If generating two children, try to make them distinct from each other and from the parents, perhaps by emphasizing different aspects of the parental combination that could lead to the desired output style.
1369
1387
 
1370
- Return a JSON object with a single key "children_prompts", which is a list of strings. Each string is a child prompt.
1371
- Example for one child: {{"children_prompts": ["child prompt 1 designed for specified style"]}}
1372
- Example for two children: {{"children_prompts": ["child prompt 1 for target style", "child prompt 2 also for target style"]}}
1373
- Generate at least one child, and at most two. All generated prompts must aim for eliciting answers in the style: '{self.output_style_guidance}'.
1388
+ All generated prompts must aim for eliciting answers in the style: '{self.output_style_guidance}'.
1389
+
1390
+ Return a JSON object that is a list of both child prompts. Each child prompt is a list of LLM messages. Example:
1391
+ [
1392
+ [{{"role": "<role>", "content": "<content>"}},{{"role": "<role>", "content": "<content>"}}],
1393
+ [{{"role": "<role>", "content": "<content>"}},{{"role": "<role>", "content": "<content>"}}]
1394
+ ]
1395
+
1396
+
1374
1397
  """
1375
1398
 
1376
- def get_radical_innovation_system_prompt(self) -> str:
1399
+ def _get_radical_innovation_system_prompt(self) -> str:
1377
1400
  return f"""You are an expert prompt engineer and a creative problem solver.
1378
1401
  Given a task description and an existing prompt for that task (which might be underperforming), your goal is to generate a new, significantly improved, and potentially very different prompt.
1379
1402
  Do not just make minor edits. Think about alternative approaches, structures, and phrasings that could lead to better performance.
@@ -1384,66 +1407,59 @@ Return only the new prompt string, with no preamble or explanation.
1384
1407
  def _infer_output_style_from_dataset(
1385
1408
  self,
1386
1409
  dataset: opik.Dataset,
1387
- task_config: TaskConfig,
1410
+ prompt: chat_prompt.ChatPrompt,
1388
1411
  n_examples: int = 5
1389
1412
  ) -> Optional[str]:
1390
1413
  """Analyzes dataset examples to infer the desired output style."""
1391
- logger.info(f"Attempting to infer output style from up to {n_examples} dataset examples...")
1392
- try:
1393
- all_items = dataset.get_items()
1394
- except Exception as e:
1395
- logger.error(f"Failed to get items from dataset '{dataset.name}': {e}")
1396
- return None
1397
-
1398
- if not all_items:
1399
- logger.warning(f"Dataset '{dataset.name}' is empty. Cannot infer output style.")
1400
- return None
1401
-
1402
- # Take the first n_examples
1403
- items_to_process = all_items[:n_examples]
1404
-
1405
- # Need at least a couple of examples for meaningful inference
1406
- if len(items_to_process) < min(n_examples, 2):
1407
- logger.warning(f"Not enough dataset items (found {len(items_to_process)}) to reliably infer output style. Need at least {min(n_examples,2)}.")
1408
- return None
1409
-
1410
- examples_str = ""
1411
- for i, item_obj in enumerate(items_to_process):
1412
- item_content = item_obj.content if hasattr(item_obj, 'content') else item_obj
1413
- if not isinstance(item_content, dict):
1414
- logger.warning(f"Dataset item {i} does not have a .content dictionary or is not a dict itself. Skipping item: {item_obj}")
1415
- continue
1414
+ with reporting.infer_output_style(verbose=self.verbose) as report_infer_output_style:
1415
+ report_infer_output_style.start_style_inference(n_examples)
1416
+
1417
+ try:
1418
+ items_to_process = dataset.get_items(n_examples)
1419
+ except Exception as e:
1420
+ report_infer_output_style.error(f"Failed to get items from dataset '{dataset.name}': {e}")
1421
+ return None
1422
+
1423
+ if not items_to_process:
1424
+ report_infer_output_style.error(f"Dataset '{dataset.name}' is empty. Cannot infer output style.")
1425
+ return None
1416
1426
 
1417
- input_parts = []
1418
- for field in task_config.input_dataset_fields:
1419
- if field in item_content:
1420
- input_parts.append(f"{field.capitalize()}: {item_content[field]}")
1421
- input_str = "\n".join(input_parts)
1422
- output_str = item_content.get(task_config.output_dataset_field, "[NO OUTPUT FIELD FOUND]")
1423
- examples_str += f"Example {i+1}:\nInput(s):\n{input_str}\nOutput: {output_str}\n---\n"
1427
+ # Need at least a couple of examples for meaningful inference
1428
+ if len(items_to_process) < min(n_examples, 2):
1429
+ report_infer_output_style.error(f"Not enough dataset items (found {len(items_to_process)}) to reliably infer output style. Need at least {min(n_examples,2)}.")
1430
+ return None
1424
1431
 
1425
- user_prompt_for_style_inference = f"""Please analyze the following input-output examples from a dataset and provide a concise, actionable description of the REQUIRED output style for the target LLM. This description will be used to guide other LLMs in generating and refining prompts.
1432
+ examples_str = ""
1433
+ for i, item_content in enumerate(items_to_process):
1434
+ filtered_content = {x: y for x, y in item_content.items() if x != "id"}
1435
+ examples_str += f"Example {i+1}:\nDataset Item:\n{filtered_content}\n---\n"
1426
1436
 
1427
- {examples_str}
1437
+ user_prompt_for_style_inference = f"""Please analyze the following examples from a dataset and provide a concise, actionable description of the REQUIRED output style for the target LLM. Before describing the output style, make sure to understand the dataset content and structure as it can include input, output and metadata fields. This description will be used to guide other LLMs in generating and refining prompts.
1428
1438
 
1429
- Based on these examples, what is the desired output style description?
1430
- Remember to focus on aspects like length, tone, structure, content details, and any recurring keywords or phrasing patterns in the outputs.
1431
- The description should be a single string that can be directly used as an instruction for another LLM.
1432
- Return ONLY this descriptive string.
1433
- """
1434
- try:
1435
- inferred_style = self._call_model(
1436
- prompt=user_prompt_for_style_inference,
1437
- system_prompt=self._INFER_STYLE_SYSTEM_PROMPT,
1438
- is_reasoning=True
1439
- )
1440
- inferred_style = inferred_style.strip()
1441
- if inferred_style:
1442
- logger.info(f"Inferred output style: '{inferred_style}'")
1443
- return inferred_style
1444
- else:
1445
- logger.warning("LLM returned empty string for inferred output style.")
1439
+ {examples_str}
1440
+
1441
+ Based on these examples, what is the desired output style description?
1442
+ Remember to focus on aspects like length, tone, structure, content details, and any recurring keywords or phrasing patterns in the outputs.
1443
+ The description should be a single string that can be directly used as an instruction for another LLM.
1444
+ Return ONLY this descriptive string.
1445
+ """
1446
+ #report_infer_output_style.display_style_inference_prompt(user_prompt_for_style_inference)
1447
+
1448
+ try:
1449
+ inferred_style = self._call_model(
1450
+ messages=[
1451
+ {"role": "system", "content": self._INFER_STYLE_SYSTEM_PROMPT},
1452
+ {"role": "user", "content": user_prompt_for_style_inference}
1453
+ ],
1454
+ is_reasoning=True
1455
+ )
1456
+ inferred_style = inferred_style.strip()
1457
+ if inferred_style:
1458
+ report_infer_output_style.success(inferred_style)
1459
+ return inferred_style
1460
+ else:
1461
+ report_infer_output_style.error("LLM returned empty string for inferred output style.")
1462
+ return None
1463
+ except Exception as e:
1464
+ report_infer_output_style.error(f"Error during output style inference: {e}")
1446
1465
  return None
1447
- except Exception as e:
1448
- logger.error(f"Error during output style inference: {e}")
1449
- return None
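Editor's note: the style returned by _infer_output_style_from_dataset only takes effect when the caller has not supplied custom guidance, mirroring the Step 3 logic earlier in this file. A condensed sketch of that decision; DEFAULT_OUTPUT_STYLE_GUIDANCE stands in for the class constant and its text here is a placeholder.

DEFAULT_OUTPUT_STYLE_GUIDANCE = "Provide a clear, appropriately detailed answer."  # placeholder text

def resolve_output_style(user_guidance, infer_output_style, inferred_style):
    # Use the inferred style only when inference is enabled and the user kept the default.
    if infer_output_style and user_guidance in (None, DEFAULT_OUTPUT_STYLE_GUIDANCE):
        if inferred_style:
            return inferred_style
    # Otherwise keep the user's guidance, falling back to the default if it was None.
    return user_guidance or DEFAULT_OUTPUT_STYLE_GUIDANCE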