opik-optimizer 0.9.1__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. opik_optimizer/__init__.py +7 -3
  2. opik_optimizer/_throttle.py +8 -8
  3. opik_optimizer/base_optimizer.py +98 -45
  4. opik_optimizer/cache_config.py +5 -3
  5. opik_optimizer/datasets/ai2_arc.py +15 -13
  6. opik_optimizer/datasets/cnn_dailymail.py +19 -15
  7. opik_optimizer/datasets/election_questions.py +10 -11
  8. opik_optimizer/datasets/gsm8k.py +16 -11
  9. opik_optimizer/datasets/halu_eval.py +6 -5
  10. opik_optimizer/datasets/hotpot_qa.py +17 -16
  11. opik_optimizer/datasets/medhallu.py +10 -7
  12. opik_optimizer/datasets/rag_hallucinations.py +11 -8
  13. opik_optimizer/datasets/ragbench.py +17 -9
  14. opik_optimizer/datasets/tiny_test.py +33 -37
  15. opik_optimizer/datasets/truthful_qa.py +18 -12
  16. opik_optimizer/demo/cache.py +6 -6
  17. opik_optimizer/demo/datasets.py +3 -7
  18. opik_optimizer/evolutionary_optimizer/__init__.py +3 -1
  19. opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +748 -437
  20. opik_optimizer/evolutionary_optimizer/reporting.py +155 -76
  21. opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +291 -181
  22. opik_optimizer/few_shot_bayesian_optimizer/reporting.py +79 -28
  23. opik_optimizer/logging_config.py +19 -15
  24. opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +234 -138
  25. opik_optimizer/meta_prompt_optimizer/reporting.py +121 -47
  26. opik_optimizer/mipro_optimizer/__init__.py +2 -0
  27. opik_optimizer/mipro_optimizer/_lm.py +41 -9
  28. opik_optimizer/mipro_optimizer/_mipro_optimizer_v2.py +37 -26
  29. opik_optimizer/mipro_optimizer/mipro_optimizer.py +135 -67
  30. opik_optimizer/mipro_optimizer/utils.py +5 -2
  31. opik_optimizer/optimizable_agent.py +179 -0
  32. opik_optimizer/optimization_config/chat_prompt.py +143 -73
  33. opik_optimizer/optimization_config/configs.py +4 -3
  34. opik_optimizer/optimization_config/mappers.py +18 -6
  35. opik_optimizer/optimization_result.py +28 -20
  36. opik_optimizer/py.typed +0 -0
  37. opik_optimizer/reporting_utils.py +96 -46
  38. opik_optimizer/task_evaluator.py +12 -14
  39. opik_optimizer/utils.py +122 -37
  40. {opik_optimizer-0.9.1.dist-info → opik_optimizer-1.0.0.dist-info}/METADATA +8 -8
  41. opik_optimizer-1.0.0.dist-info/RECORD +50 -0
  42. opik_optimizer-0.9.1.dist-info/RECORD +0 -48
  43. {opik_optimizer-0.9.1.dist-info → opik_optimizer-1.0.0.dist-info}/WHEEL +0 -0
  44. {opik_optimizer-0.9.1.dist-info → opik_optimizer-1.0.0.dist-info}/licenses/LICENSE +0 -0
  45. {opik_optimizer-0.9.1.dist-info → opik_optimizer-1.0.0.dist-info}/top_level.txt +0 -0
@@ -2,9 +2,9 @@ import json
2
2
  import logging
3
3
  import os
4
4
  import random
5
- from typing import Any, Callable, Dict, List, Literal, Optional, Set, Tuple, cast
5
+ from typing import Any, Callable, Dict, List, Optional, Set, Tuple, cast, Type
6
6
 
7
- import Levenshtein
7
+ import rapidfuzz.distance.Indel
8
8
  import litellm
9
9
  import numpy as np
10
10
  import opik
@@ -23,6 +23,7 @@ from opik_optimizer import _throttle, task_evaluator
23
23
  from opik_optimizer.base_optimizer import BaseOptimizer, OptimizationRound
24
24
  from opik_optimizer.optimization_config import chat_prompt, mappers
25
25
  from opik_optimizer.optimization_result import OptimizationResult
26
+ from opik_optimizer.optimizable_agent import OptimizableAgent
26
27
 
27
28
  from .. import utils
28
29
  from . import reporting
@@ -37,6 +38,7 @@ litellm.cache = Cache(type=LiteLLMCacheType.DISK, disk_cache_dir=disk_cache_dir)
37
38
 
38
39
  creator = cast(Any, _creator) # type: ignore[assignment]
39
40
 
41
+
40
42
  class EvolutionaryOptimizer(BaseOptimizer):
41
43
  """
42
44
  The Evolutionary Optimizer can be used to optimize prompts using a 4 stage genetic algorithm
@@ -47,10 +49,10 @@ class EvolutionaryOptimizer(BaseOptimizer):
47
49
  2. Evaluate the candidate prompts
48
50
  3. Select the best prompts
49
51
  4. Repeat until convergence
50
-
52
+
51
53
  This algorithm is best used if you have a first draft prompt and would like to find a better
52
54
  prompt.
53
-
55
+
54
56
  Note: This algorithm is time consuming and can be expensive to run.
55
57
  """
56
58
 
@@ -65,15 +67,17 @@ class EvolutionaryOptimizer(BaseOptimizer):
65
67
  DEFAULT_MIN_MUTATION_RATE = 0.1
66
68
  DEFAULT_MAX_MUTATION_RATE = 0.4
67
69
  DEFAULT_ADAPTIVE_MUTATION = True
68
- DEFAULT_DIVERSITY_THRESHOLD = 0.7
69
- DEFAULT_RESTART_THRESHOLD = 0.01
70
- DEFAULT_RESTART_GENERATIONS = 3
71
- DEFAULT_CACHE_SIZE = 1000
70
+ DEFAULT_DIVERSITY_THRESHOLD = 0.7
71
+ DEFAULT_RESTART_THRESHOLD = 0.01
72
+ DEFAULT_RESTART_GENERATIONS = 3
73
+ DEFAULT_CACHE_SIZE = 1000
72
74
  DEFAULT_EARLY_STOPPING_GENERATIONS = 5
73
- DEFAULT_ENABLE_MOO = True
74
- DEFAULT_ENABLE_LLM_CROSSOVER = True
75
+ DEFAULT_ENABLE_MOO = True
76
+ DEFAULT_ENABLE_LLM_CROSSOVER = True
75
77
  DEFAULT_SEED = 42
76
- DEFAULT_OUTPUT_STYLE_GUIDANCE = "Produce clear, effective, and high-quality responses suitable for the task."
78
+ DEFAULT_OUTPUT_STYLE_GUIDANCE = (
79
+ "Produce clear, effective, and high-quality responses suitable for the task."
80
+ )
77
81
  DEFAULT_MOO_WEIGHTS = (1.0, -1.0) # (Maximize Score, Minimize Length)
78
82
 
79
83
  _INFER_STYLE_SYSTEM_PROMPT = """You are an expert in linguistic analysis and prompt engineering. Your task is to analyze a few input-output examples from a dataset and provide a concise, actionable description of the desired output style. This description will be used to guide other LLMs in generating and refining prompts.
@@ -85,7 +89,7 @@ Focus on characteristics like:
85
89
  - **Content Details**: (e.g., includes only the answer, includes reasoning, provides examples, avoids pleasantries)
86
90
  - **Keywords/Phrasing**: Any recurring keywords or phrasing patterns in the outputs.
87
91
 
88
- Provide a single string that summarizes this style. This summary should be directly usable as an instruction for another LLM.
92
+ Provide a single string that summarizes this style. This summary should be directly usable as an instruction for another LLM.
89
93
  For example: 'Outputs should be a single, concise proper noun.' OR 'Outputs should be a short paragraph explaining the reasoning, followed by a direct answer, avoiding conversational pleasantries.' OR 'Outputs are typically 1-2 sentences, providing a direct factual answer.'
90
94
  Return ONLY this descriptive string, with no preamble or extra formatting.
91
95
  """
@@ -93,7 +97,6 @@ Return ONLY this descriptive string, with no preamble or extra formatting.
93
97
  def __init__(
94
98
  self,
95
99
  model: str,
96
- project_name: str = "Optimization",
97
100
  population_size: int = DEFAULT_POPULATION_SIZE,
98
101
  num_generations: int = DEFAULT_NUM_GENERATIONS,
99
102
  mutation_rate: float = DEFAULT_MUTATION_RATE,
@@ -108,12 +111,11 @@ Return ONLY this descriptive string, with no preamble or extra formatting.
108
111
  output_style_guidance: Optional[str] = None,
109
112
  infer_output_style: bool = False,
110
113
  verbose: int = 1,
111
- **model_kwargs,
112
- ):
114
+ **model_kwargs: Any,
115
+ ) -> None:
113
116
  """
114
117
  Args:
115
118
  model: The model to use for evaluation
116
- project_name: Optional project name for tracking
117
119
  population_size: Number of prompts in the population
118
120
  num_generations: Number of generations to run
119
121
  mutation_rate: Mutation rate for genetic operations
@@ -131,7 +133,7 @@ Return ONLY this descriptive string, with no preamble or extra formatting.
131
133
  **model_kwargs: Additional model parameters
132
134
  """
133
135
  # Initialize base class first
134
- super().__init__(model=model, project_name=project_name, **model_kwargs)
136
+ super().__init__(model=model, verbose=verbose, **model_kwargs)
135
137
  self.population_size = population_size
136
138
  self.num_generations = num_generations
137
139
  self.mutation_rate = mutation_rate
@@ -143,20 +145,22 @@ Return ONLY this descriptive string, with no preamble or extra formatting.
143
145
  self.enable_moo = enable_moo
144
146
  self.enable_llm_crossover = enable_llm_crossover
145
147
  self.seed = seed
146
- self.output_style_guidance = output_style_guidance if output_style_guidance is not None else self.DEFAULT_OUTPUT_STYLE_GUIDANCE
148
+ self.output_style_guidance = (
149
+ output_style_guidance
150
+ if output_style_guidance is not None
151
+ else self.DEFAULT_OUTPUT_STYLE_GUIDANCE
152
+ )
147
153
  self.infer_output_style = infer_output_style
148
154
  self.llm_call_counter = 0
149
155
  self._opik_client = opik_client.get_client_cached()
150
- self._current_optimization_id = None
156
+ self._current_optimization_id: Optional[str] = None
151
157
  self._current_generation = 0
152
- self._best_fitness_history = []
158
+ self._best_fitness_history: List[float] = []
153
159
  self._generations_without_improvement = 0
154
- self._llm_cache = {}
155
- self._current_population = []
160
+ self._current_population: List[Any] = []
156
161
  self._generations_without_overall_improvement = 0
157
- self._best_primary_score_history: list[float] = []
162
+ self._best_primary_score_history: List[float] = []
158
163
  self._gens_since_pop_improvement: int = 0
159
- self.verbose = verbose
160
164
 
161
165
  if self.seed is not None:
162
166
  random.seed(self.seed)
@@ -167,33 +171,44 @@ Return ONLY this descriptive string, with no preamble or extra formatting.
167
171
 
168
172
  if self.enable_moo:
169
173
  if not hasattr(creator, "FitnessMulti"):
170
- creator.create("FitnessMulti", base.Fitness, weights=self.DEFAULT_MOO_WEIGHTS)
174
+ creator.create(
175
+ "FitnessMulti", base.Fitness, weights=self.DEFAULT_MOO_WEIGHTS
176
+ )
171
177
  fitness_attr = creator.FitnessMulti
172
178
  else:
173
179
  if not hasattr(creator, "FitnessMax"):
174
180
  creator.create("FitnessMax", base.Fitness, weights=(1.0,))
175
181
  fitness_attr = creator.FitnessMax
176
-
177
- if not hasattr(creator, "Individual") or getattr(creator.Individual, "fitness") != fitness_attr:
182
+
183
+ if (
184
+ not hasattr(creator, "Individual")
185
+ or getattr(creator.Individual, "fitness") != fitness_attr
186
+ ):
178
187
  if hasattr(creator, "Individual"):
179
188
  del creator.Individual
180
189
  creator.create("Individual", list, fitness=fitness_attr)
181
190
 
182
191
  self.toolbox = base.Toolbox()
183
- self.toolbox.register("default_individual", lambda: creator.Individual("placeholder"))
184
- self.toolbox.register("population", tools.initRepeat, list, self.toolbox.default_individual)
185
-
192
+ self.toolbox.register(
193
+ "default_individual", lambda: creator.Individual("placeholder")
194
+ )
195
+ self.toolbox.register(
196
+ "population", tools.initRepeat, list, self.toolbox.default_individual
197
+ )
198
+
186
199
  if self.enable_llm_crossover:
187
200
  self.toolbox.register("mate", self._llm_deap_crossover)
188
201
  else:
189
202
  self.toolbox.register("mate", self._deap_crossover)
190
-
203
+
191
204
  self.toolbox.register("mutate", self._deap_mutation)
192
-
205
+
193
206
  if self.enable_moo:
194
207
  self.toolbox.register("select", tools.selNSGA2)
195
208
  else:
196
- self.toolbox.register("select", tools.selTournament, tournsize=self.tournament_size)
209
+ self.toolbox.register(
210
+ "select", tools.selTournament, tournsize=self.tournament_size
211
+ )
197
212
 
198
213
  logger.debug(
199
214
  f"Initialized EvolutionaryOptimizer with model: {model}, MOO_enabled: {self.enable_moo}, "
@@ -209,22 +224,27 @@ Return ONLY this descriptive string, with no preamble or extra formatting.
209
224
  return self.mutation_rate
210
225
 
211
226
  # Calculate improvement rate
212
- recent_improvement = (self._best_fitness_history[-1] - self._best_fitness_history[-2]) / abs(self._best_fitness_history[-2])
213
-
227
+ recent_improvement = (
228
+ self._best_fitness_history[-1] - self._best_fitness_history[-2]
229
+ ) / abs(self._best_fitness_history[-2])
230
+
214
231
  # Calculate population diversity
215
232
  current_diversity = self._calculate_population_diversity()
216
-
233
+
217
234
  # Check for stagnation
218
235
  if recent_improvement < self.DEFAULT_RESTART_THRESHOLD:
219
236
  self._generations_without_improvement += 1
220
237
  else:
221
238
  self._generations_without_improvement = 0
222
-
239
+
223
240
  # Adjust mutation rate based on both improvement and diversity
224
241
  if self._generations_without_improvement >= self.DEFAULT_RESTART_GENERATIONS:
225
242
  # Significant stagnation - increase mutation significantly
226
243
  return min(self.mutation_rate * 2.5, self.DEFAULT_MAX_MUTATION_RATE)
227
- elif recent_improvement < 0.01 and current_diversity < self.DEFAULT_DIVERSITY_THRESHOLD:
244
+ elif (
245
+ recent_improvement < 0.01
246
+ and current_diversity < self.DEFAULT_DIVERSITY_THRESHOLD
247
+ ):
228
248
  # Both stagnating and low diversity - increase mutation significantly
229
249
  return min(self.mutation_rate * 2.0, self.DEFAULT_MAX_MUTATION_RATE)
230
250
  elif recent_improvement < 0.01:
@@ -237,29 +257,34 @@ Return ONLY this descriptive string, with no preamble or extra formatting.
237
257
 
238
258
  def _calculate_population_diversity(self) -> float:
239
259
  """Calculate the diversity of the current population."""
240
- if not hasattr(self, '_current_population') or not self._current_population:
260
+ if not hasattr(self, "_current_population") or not self._current_population:
241
261
  return 0.0
242
-
243
- # Calculate average Levenshtein distance between all pairs
262
+
263
+ # Calculate average Levenshtein using rapidfuzz distance between all pairs
244
264
  total_distance = 0.0
245
265
  count = 0
246
266
  for i in range(len(self._current_population)):
247
267
  for j in range(i + 1, len(self._current_population)):
248
268
  str1 = str(self._current_population[i])
249
269
  str2 = str(self._current_population[j])
250
- distance = Levenshtein.distance(str1, str2)
270
+ distance = rapidfuzz.distance.Indel.normalized_similarity(str1, str2)
251
271
  max_len = max(len(str1), len(str2))
252
272
  if max_len > 0:
253
273
  normalized_distance = distance / max_len
254
274
  total_distance += normalized_distance
255
275
  count += 1
256
-
257
- return total_distance / count if count > 0 else 0.0
258
276
 
277
+ return total_distance / count if count > 0 else 0.0
259
278
 
260
- def _deap_crossover_chunking_strategy(self, messages_1_str: str, messages_2_str: str) -> Tuple[str, str]:
261
- chunks1 = [chunk.strip() for chunk in messages_1_str.split('.') if chunk.strip()]
262
- chunks2 = [chunk.strip() for chunk in messages_2_str.split('.') if chunk.strip()]
279
+ def _deap_crossover_chunking_strategy(
280
+ self, messages_1_str: str, messages_2_str: str
281
+ ) -> Tuple[str, str]:
282
+ chunks1 = [
283
+ chunk.strip() for chunk in messages_1_str.split(".") if chunk.strip()
284
+ ]
285
+ chunks2 = [
286
+ chunk.strip() for chunk in messages_2_str.split(".") if chunk.strip()
287
+ ]
263
288
 
264
289
  # Try chunk-level crossover if both parents have at least 2 chunks
265
290
  if len(chunks1) >= 2 and len(chunks2) >= 2:
@@ -267,18 +292,22 @@ Return ONLY this descriptive string, with no preamble or extra formatting.
267
292
  # Crossover point is between 1 and min_num_chunks - 1
268
293
  # This requires min_num_chunks >= 2, which is already checked.
269
294
  point = random.randint(1, min_num_chunks - 1)
270
-
295
+
271
296
  child1_chunks = chunks1[:point] + chunks2[point:]
272
297
  child2_chunks = chunks2[:point] + chunks1[point:]
273
-
274
- child1_str = '. '.join(child1_chunks) + ('.' if child1_chunks else '')
275
- child2_str = '. '.join(child2_chunks) + ('.' if child2_chunks else '')
276
-
298
+
299
+ child1_str = ". ".join(child1_chunks) + ("." if child1_chunks else "")
300
+ child2_str = ". ".join(child2_chunks) + ("." if child2_chunks else "")
301
+
277
302
  return child1_str, child2_str
278
303
  else:
279
- raise ValueError("Not enough chunks in either prompt for chunk-level crossover")
280
-
281
- def _deap_crossover_word_level(self, messages_1_str: str, messages_2_str: str) -> Tuple[str, str]:
304
+ raise ValueError(
305
+ "Not enough chunks in either prompt for chunk-level crossover"
306
+ )
307
+
308
+ def _deap_crossover_word_level(
309
+ self, messages_1_str: str, messages_2_str: str
310
+ ) -> Tuple[str, str]:
282
311
  words1 = messages_1_str.split()
283
312
  words2 = messages_2_str.split()
284
313
 
@@ -295,53 +324,54 @@ Return ONLY this descriptive string, with no preamble or extra formatting.
295
324
  point = random.randint(1, min_word_len - 1)
296
325
  child1_words = words1[:point] + words2[point:]
297
326
  child2_words = words2[:point] + words1[point:]
298
-
299
- return ' '.join(child1_words), ' '.join(child2_words)
300
-
301
- def _deap_crossover(
302
- self,
303
- ind1: "creator.Individual",
304
- ind2: "creator.Individual"
305
- ) -> Tuple["creator.Individual", "creator.Individual"]:
327
+
328
+ return " ".join(child1_words), " ".join(child2_words)
329
+
330
+ def _deap_crossover(self, ind1: Any, ind2: Any) -> Tuple[Any, Any]:
306
331
  """Enhanced crossover operation that preserves semantic meaning.
307
332
  Attempts chunk-level crossover first, then falls back to word-level.
308
333
  """
309
- reporting.display_message(" Recombining prompts by mixing and matching words and sentences.", verbose=self.verbose)
310
- messages_1_orig: List[Dict[Literal["role", "content"], str]] = ind1
311
- messages_2_orig: List[Dict[Literal["role", "content"], str]] = ind2
334
+ reporting.display_message(
335
+ " Recombining prompts by mixing and matching words and sentences.",
336
+ verbose=self.verbose,
337
+ )
338
+ messages_1_orig: List[Dict[str, str]] = ind1
339
+ messages_2_orig: List[Dict[str, str]] = ind2
312
340
 
313
341
  for i, message_1 in enumerate(messages_1_orig):
314
- role: str = message_1['role']
315
- message_1_str: str = message_1['content']
342
+ role: str = message_1["role"]
343
+ message_1_str: str = message_1["content"]
316
344
 
317
345
  # We check that the second message has enough AI messages and the correct role
318
- if (len(messages_2_orig) >= i + 1) and (messages_2_orig[i]['role'] == role):
346
+ if (len(messages_2_orig) >= i + 1) and (messages_2_orig[i]["role"] == role):
319
347
  message_2 = messages_2_orig[i]
320
- message_2_str: str = message_2['content']
348
+ message_2_str: str = message_2["content"]
321
349
 
322
350
  try:
323
- child1_str, child2_str = self._deap_crossover_chunking_strategy(message_1_str, message_2_str)
351
+ child1_str, child2_str = self._deap_crossover_chunking_strategy(
352
+ message_1_str, message_2_str
353
+ )
324
354
  except ValueError:
325
- child1_str, child2_str = self._deap_crossover_word_level(message_1_str, message_2_str)
326
-
355
+ child1_str, child2_str = self._deap_crossover_word_level(
356
+ message_1_str, message_2_str
357
+ )
358
+
327
359
  # Update the message content
328
- messages_1_orig[i]['content'] = child1_str
329
- messages_2_orig[i]['content'] = child2_str
360
+ messages_1_orig[i]["content"] = child1_str
361
+ messages_2_orig[i]["content"] = child2_str
330
362
  else:
331
363
  # We don't perform any crossover if there are not enough messages or the roles
332
364
  # don't match
333
365
  pass
334
-
366
+
335
367
  return creator.Individual(messages_1_orig), creator.Individual(messages_2_orig)
336
368
 
337
369
  def _deap_mutation(
338
- self,
339
- individual: "creator.Individual",
340
- initial_prompt: chat_prompt.ChatPrompt
341
- ) -> "creator.Individual":
370
+ self, individual: Any, initial_prompt: chat_prompt.ChatPrompt
371
+ ) -> Any:
342
372
  """Enhanced mutation operation with multiple strategies."""
343
373
  prompt = chat_prompt.ChatPrompt(messages=individual)
344
-
374
+
345
375
  # Choose mutation strategy based on current diversity
346
376
  diversity = self._calculate_population_diversity()
347
377
 
@@ -349,54 +379,68 @@ Return ONLY this descriptive string, with no preamble or extra formatting.
349
379
  if diversity < self.DEFAULT_DIVERSITY_THRESHOLD:
350
380
  # Low diversity - use more aggressive mutations (higher chance for semantic)
351
381
  semantic_threshold = 0.5
352
- structural_threshold = 0.8 # semantic_threshold + 0.3
382
+ structural_threshold = 0.8 # semantic_threshold + 0.3
353
383
  else:
354
384
  # Good diversity - use more conservative mutations (higher chance for word_level)
355
385
  semantic_threshold = 0.4
356
- structural_threshold = 0.7 # semantic_threshold + 0.3
386
+ structural_threshold = 0.7 # semantic_threshold + 0.3
357
387
 
358
388
  mutation_choice = random.random()
359
389
 
360
390
  if mutation_choice > structural_threshold:
361
391
  # This corresponds to the original 'else' (word_level_mutation)
362
392
  mutated_prompt = self._word_level_mutation_prompt(prompt)
363
- reporting.display_success(" Mutation successful, prompt has been edited by randomizing words (word-level mutation).", verbose=self.verbose)
364
- return creator.Individual(mutated_prompt.formatted_messages)
393
+ reporting.display_success(
394
+ " Mutation successful, prompt has been edited by randomizing words (word-level mutation).",
395
+ verbose=self.verbose,
396
+ )
397
+ return creator.Individual(mutated_prompt.get_messages())
365
398
  elif mutation_choice > semantic_threshold:
366
399
  # This corresponds to the original 'elif' (structural_mutation)
367
400
  mutated_prompt = self._structural_mutation(prompt)
368
- reporting.display_success(" Mutation successful, prompt has been edited by reordering, combining, or splitting sentences (structural mutation).", verbose=self.verbose)
369
- return creator.Individual(mutated_prompt.formatted_messages)
401
+ reporting.display_success(
402
+ " Mutation successful, prompt has been edited by reordering, combining, or splitting sentences (structural mutation).",
403
+ verbose=self.verbose,
404
+ )
405
+ return creator.Individual(mutated_prompt.get_messages())
370
406
  else:
371
407
  # This corresponds to the original 'if' (semantic_mutation)
372
408
  mutated_prompt = self._semantic_mutation(prompt, initial_prompt)
373
- reporting.display_success(" Mutation successful, prompt has been edited using an LLM (semantic mutation).", verbose=self.verbose)
374
- return creator.Individual(mutated_prompt.formatted_messages)
409
+ reporting.display_success(
410
+ " Mutation successful, prompt has been edited using an LLM (semantic mutation).",
411
+ verbose=self.verbose,
412
+ )
413
+ return creator.Individual(mutated_prompt.get_messages())
375
414
 
376
415
  def _semantic_mutation(
377
- self,
378
- prompt: chat_prompt.ChatPrompt,
379
- initial_prompt: chat_prompt.ChatPrompt
380
- ) -> chat_prompt.ChatPrompt:
416
+ self, prompt: chat_prompt.ChatPrompt, initial_prompt: chat_prompt.ChatPrompt
417
+ ) -> chat_prompt.ChatPrompt:
381
418
  """Enhanced semantic mutation with multiple strategies."""
382
419
  current_output_style_guidance = self.output_style_guidance
383
- if random.random() < 0.1:
420
+ if random.random() < 0.1:
384
421
  return self._radical_innovation_mutation(prompt, initial_prompt)
385
-
422
+
386
423
  try:
387
- strategy = random.choice([
388
- "rephrase", "simplify", "elaborate", "restructure", "focus", "increase_complexity_and_detail"
389
- ])
390
-
424
+ strategy = random.choice(
425
+ [
426
+ "rephrase",
427
+ "simplify",
428
+ "elaborate",
429
+ "restructure",
430
+ "focus",
431
+ "increase_complexity_and_detail",
432
+ ]
433
+ )
434
+
391
435
  strategy_prompts = {
392
436
  "rephrase": f"Create a different way to express the same instruction, possibly with a different length or structure, ensuring it still aims for an answer from the target LLM in the style of: '{current_output_style_guidance}'.",
393
437
  "simplify": f"Simplify the instruction while maintaining its core meaning, potentially making it more concise, to elicit an answer in the style of: '{current_output_style_guidance}'.",
394
438
  "elaborate": f"Add more relevant detail and specificity to the instruction, potentially increasing its length, but only if it helps achieve a more accurate answer from the target LLM in the style of: '{current_output_style_guidance}'.",
395
439
  "restructure": f"Change the structure of the instruction (e.g., reorder sentences, combine/split ideas) while keeping its intent, ensuring the new structure strongly guides towards an output in the style of: '{current_output_style_guidance}'.",
396
440
  "focus": f"Emphasize the key aspects of the instruction, perhaps by rephrasing or adding clarifying statements, to better elicit an answer in the style of: '{current_output_style_guidance}'.",
397
- "increase_complexity_and_detail": f"Significantly elaborate on this instruction. Add more details, examples, context, or constraints to make it more comprehensive. The goal of this elaboration is to make the prompt itself more detailed, so that it VERY CLEARLY guides the target LLM to produce a highly accurate final answer in the style of: '{current_output_style_guidance}'. The prompt can be long if needed to achieve this output style."
441
+ "increase_complexity_and_detail": f"Significantly elaborate on this instruction. Add more details, examples, context, or constraints to make it more comprehensive. The goal of this elaboration is to make the prompt itself more detailed, so that it VERY CLEARLY guides the target LLM to produce a highly accurate final answer in the style of: '{current_output_style_guidance}'. The prompt can be long if needed to achieve this output style.",
398
442
  }
399
-
443
+
400
444
  user_prompt_for_semantic_mutation = f"""Given this prompt: '{prompt}'
401
445
  Task context: {self._get_task_description_for_llm(initial_prompt)}
402
446
  Desired output style from target LLM: '{current_output_style_guidance}'
@@ -405,46 +449,57 @@ Return only the modified prompt message list, nothing else. Make sure to return
405
449
  """
406
450
  response = self._call_model(
407
451
  messages=[
408
- {"role": "system", "content": f"You are a prompt engineering expert. Your goal is to modify prompts to improve their effectiveness in eliciting specific types of answers, particularly matching the style: '{current_output_style_guidance}'. Follow the specific modification instruction provided."},
409
- {"role": "user", "content": user_prompt_for_semantic_mutation}
452
+ {
453
+ "role": "system",
454
+ "content": f"You are a prompt engineering expert. Your goal is to modify prompts to improve their effectiveness in eliciting specific types of answers, particularly matching the style: '{current_output_style_guidance}'. Follow the specific modification instruction provided.",
455
+ },
456
+ {"role": "user", "content": user_prompt_for_semantic_mutation},
410
457
  ],
411
- is_reasoning=True
458
+ is_reasoning=True,
412
459
  )
413
460
 
414
461
  return chat_prompt.ChatPrompt(messages=utils.json_to_dict(response.strip()))
415
462
  except Exception as e:
416
- reporting.display_error(f" Error in semantic mutation, this is usually a parsing error: {e}", verbose=self.verbose)
463
+ reporting.display_error(
464
+ f" Error in semantic mutation, this is usually a parsing error: {e}",
465
+ verbose=self.verbose,
466
+ )
417
467
  return prompt
418
468
 
419
469
  def _structural_mutation(
420
- self,
421
- prompt: chat_prompt.ChatPrompt
422
- ) -> chat_prompt.ChatPrompt:
470
+ self, prompt: chat_prompt.ChatPrompt
471
+ ) -> chat_prompt.ChatPrompt:
423
472
  """Perform structural mutation (reordering, combining, splitting)."""
424
- mutated_messages: List[Dict[Literal["role", "content"], str]] = []
473
+ mutated_messages: List[Dict[str, str]] = []
425
474
 
426
- for message in prompt.formatted_messages:
475
+ for message in prompt.get_messages():
427
476
  content = message["content"]
428
477
  role = message["role"]
429
478
 
430
- sentences = [s.strip() for s in content.split('.') if s.strip()]
479
+ sentences = [s.strip() for s in content.split(".") if s.strip()]
431
480
  if len(sentences) <= 1:
432
- mutated_messages.append({"role": role, "content": self._word_level_mutation(content)})
481
+ mutated_messages.append(
482
+ {"role": role, "content": self._word_level_mutation(content)}
483
+ )
433
484
  continue
434
-
485
+
435
486
  mutation_type = random.random()
436
487
  if mutation_type < 0.3:
437
488
  # Reorder sentences
438
489
  random.shuffle(sentences)
439
- mutated_messages.append({"role": role, "content": '. '.join(sentences) + '.'})
490
+ mutated_messages.append(
491
+ {"role": role, "content": ". ".join(sentences) + "."}
492
+ )
440
493
  continue
441
494
  elif mutation_type < 0.6:
442
495
  # Combine adjacent sentences
443
496
  if len(sentences) >= 2:
444
497
  idx = random.randint(0, len(sentences) - 2)
445
- combined = sentences[idx] + ' and ' + sentences[idx + 1]
446
- sentences[idx:idx+2] = [combined]
447
- mutated_messages.append({"role": role, "content": '. '.join(sentences) + '.'})
498
+ combined = sentences[idx] + " and " + sentences[idx + 1]
499
+ sentences[idx : idx + 2] = [combined]
500
+ mutated_messages.append(
501
+ {"role": role, "content": ". ".join(sentences) + "."}
502
+ )
448
503
  continue
449
504
  else:
450
505
  # Split a sentence
@@ -452,33 +507,45 @@ Return only the modified prompt message list, nothing else. Make sure to return
452
507
  words = sentences[idx].split()
453
508
  if len(words) > 3:
454
509
  split_point = random.randint(2, len(words) - 2)
455
- sentences[idx:idx+1] = [' '.join(words[:split_point]), ' '.join(words[split_point:])]
456
- mutated_messages.append({"role": role, "content": '. '.join(sentences) + '.'})
510
+ sentences[idx : idx + 1] = [
511
+ " ".join(words[:split_point]),
512
+ " ".join(words[split_point:]),
513
+ ]
514
+ mutated_messages.append(
515
+ {"role": role, "content": ". ".join(sentences) + "."}
516
+ )
457
517
  continue
458
518
  else:
459
519
  mutated_messages.append({"role": role, "content": content})
460
520
 
461
521
  return chat_prompt.ChatPrompt(messages=mutated_messages)
462
522
 
463
- def _word_level_mutation_prompt(self, prompt: chat_prompt.ChatPrompt) -> chat_prompt.ChatPrompt:
464
- mutated_messages: List[Dict[Literal['role', 'content'], str]] = []
465
- for message in prompt.formatted_messages:
466
- mutated_messages.append({"role": message["role"], "content": self._word_level_mutation(message["content"])})
523
+ def _word_level_mutation_prompt(
524
+ self, prompt: chat_prompt.ChatPrompt
525
+ ) -> chat_prompt.ChatPrompt:
526
+ mutated_messages: List[Dict[str, str]] = []
527
+ for message in prompt.get_messages():
528
+ mutated_messages.append(
529
+ {
530
+ "role": message["role"],
531
+ "content": self._word_level_mutation(message["content"]),
532
+ }
533
+ )
467
534
  return chat_prompt.ChatPrompt(messages=mutated_messages)
468
-
535
+
469
536
  def _word_level_mutation(self, msg_content: str) -> str:
470
537
  """Perform word-level mutation."""
471
538
  words = msg_content.split()
472
539
  if len(words) <= 1:
473
540
  return msg_content
474
-
541
+
475
542
  mutation_type = random.random()
476
- if mutation_type < 0.3:
543
+ if mutation_type < 0.3:
477
544
  # Word replacement
478
545
  idx = random.randint(0, len(words) - 1)
479
546
  words[idx] = self._get_synonym(words[idx])
480
547
  elif mutation_type < 0.6:
481
- # Word reordering
548
+ # Word reordering
482
549
  if len(words) > 2:
483
550
  i, j = random.sample(range(len(words)), 2)
484
551
  words[i], words[j] = words[j], words[i]
@@ -486,39 +553,45 @@ Return only the modified prompt message list, nothing else. Make sure to return
486
553
  # Phrase modification
487
554
  idx = random.randint(0, len(words) - 1)
488
555
  words[idx] = self._modify_phrase(words[idx])
489
-
490
- return ' '.join(words)
491
556
 
492
- def _get_synonym(
493
- self,
494
- word: str
495
- ) -> str:
557
+ return " ".join(words)
558
+
559
+ def _get_synonym(self, word: str) -> str:
496
560
  """Get a synonym for a word using LLM."""
497
561
  try:
498
562
  response = self._call_model(
499
563
  messages=[
500
- {"role": "system", "content": "You are a helpful assistant that provides synonyms. Return only the synonym word, no explanation or additional text."},
501
- {"role": "user", "content": f"Give me a single synonym for the word '{word}'. Return only the synonym, nothing else."}
564
+ {
565
+ "role": "system",
566
+ "content": "You are a helpful assistant that provides synonyms. Return only the synonym word, no explanation or additional text.",
567
+ },
568
+ {
569
+ "role": "user",
570
+ "content": f"Give me a single synonym for the word '{word}'. Return only the synonym, nothing else.",
571
+ },
502
572
  ],
503
- is_reasoning=True
573
+ is_reasoning=True,
504
574
  )
505
575
  return response.strip()
506
576
  except Exception as e:
507
577
  logger.warning(f"Error getting synonym for '{word}': {e}")
508
578
  return word
509
579
 
510
- def _modify_phrase(
511
- self,
512
- phrase: str
513
- ) -> str:
580
+ def _modify_phrase(self, phrase: str) -> str:
514
581
  """Modify a phrase while preserving meaning using LLM."""
515
582
  try:
516
583
  response = self._call_model(
517
584
  messages=[
518
- {"role": "system", "content": "You are a helpful assistant that rephrases text. Return only the modified phrase, no explanation or additional text."},
519
- {"role": "user", "content": f"Modify this phrase while keeping the same meaning: '{phrase}'. Return only the modified phrase, nothing else."}
585
+ {
586
+ "role": "system",
587
+ "content": "You are a helpful assistant that rephrases text. Return only the modified phrase, no explanation or additional text.",
588
+ },
589
+ {
590
+ "role": "user",
591
+ "content": f"Modify this phrase while keeping the same meaning: '{phrase}'. Return only the modified phrase, nothing else.",
592
+ },
520
593
  ],
521
- is_reasoning=True
594
+ is_reasoning=True,
522
595
  )
523
596
  return response.strip()
524
597
  except Exception as e:
@@ -526,47 +599,53 @@ Return only the modified prompt message list, nothing else. Make sure to return
526
599
  return phrase
527
600
 
528
601
  def _radical_innovation_mutation(
529
- self,
530
- prompt: chat_prompt.ChatPrompt,
531
- initial_prompt: chat_prompt.ChatPrompt
532
- ) -> chat_prompt.ChatPrompt:
602
+ self, prompt: chat_prompt.ChatPrompt, initial_prompt: chat_prompt.ChatPrompt
603
+ ) -> chat_prompt.ChatPrompt:
533
604
  """Attempts to generate a significantly improved and potentially very different prompt using an LLM."""
534
- logger.debug(f"Attempting radical innovation for prompt: {json.dumps(prompt.formatted_messages)[:70]}...")
605
+ logger.debug(
606
+ f"Attempting radical innovation for prompt: {json.dumps(prompt.get_messages())[:70]}..."
607
+ )
535
608
  task_desc_for_llm = self._get_task_description_for_llm(initial_prompt)
536
609
  current_output_style_guidance = self.output_style_guidance
537
-
610
+
538
611
  user_prompt_for_radical_innovation = f"""Task Context:
539
612
  {task_desc_for_llm}
540
613
  Desired output style from target LLM: '{current_output_style_guidance}'
541
614
 
542
615
  Existing Prompt (which may be underperforming):
543
- '''{prompt.formatted_messages}'''
616
+ '''{prompt.get_messages()}'''
544
617
 
545
- Please generate a new, significantly improved, and potentially very different prompt for this task.
618
+ Please generate a new, significantly improved, and potentially very different prompt for this task.
546
619
  Focus on alternative approaches, better clarity, or more effective guidance for the language model, aiming for the desired output style.
547
620
  Return only the new prompt list object.
548
621
  """
549
622
  try:
550
623
  new_prompt_str = self._call_model(
551
624
  messages=[
552
- {"role": "system", "content": self._get_radical_innovation_system_prompt()},
553
- {"role": "user", "content": user_prompt_for_radical_innovation}
625
+ {
626
+ "role": "system",
627
+ "content": self._get_radical_innovation_system_prompt(),
628
+ },
629
+ {"role": "user", "content": user_prompt_for_radical_innovation},
554
630
  ],
555
- is_reasoning=True
631
+ is_reasoning=True,
632
+ )
633
+ logger.info(
634
+ f"Radical innovation generated: {new_prompt_str[:70]}... from: {json.dumps(prompt.get_messages())[:70]}..."
556
635
  )
557
- logger.info(f"Radical innovation generated: {new_prompt_str[:70]}... from: {json.dumps(prompt.formatted_messages)[:70]}...")
558
636
  return chat_prompt.ChatPrompt(messages=json.loads(new_prompt_str))
559
637
  except Exception as e:
560
- logger.warning(f"Radical innovation mutation failed for prompt '{json.dumps(prompt.formatted_messages)[:50]}...': {e}. Returning original.")
638
+ logger.warning(
639
+ f"Radical innovation mutation failed for prompt '{json.dumps(prompt.get_messages())[:50]}...': {e}. Returning original."
640
+ )
561
641
  return prompt
562
642
 
563
643
  def _initialize_population(
564
- self,
565
- prompt: chat_prompt.ChatPrompt
644
+ self, prompt: chat_prompt.ChatPrompt
566
645
  ) -> List[chat_prompt.ChatPrompt]:
567
- """Initialize the population with diverse variations of the initial prompt,
568
- including some 'fresh start' prompts based purely on task description.
569
- All generated prompts should aim to elicit answers matching self.output_style_guidance.
646
+ """Initialize the population with diverse variations of the initial prompt,
647
+ including some 'fresh start' prompts based purely on task description.
648
+ All generated prompts should aim to elicit answers matching self.output_style_guidance.
570
649
  """
571
650
  with reporting.initializing_population(verbose=self.verbose) as init_pop_report:
572
651
  init_pop_report.start(self.population_size)
@@ -593,59 +672,75 @@ Return only the new prompt list object.
593
672
 
594
673
  Please generate {num_fresh_starts} diverse and effective prompt(s) for a language model to accomplish this task, ensuring they guide towards this specific output style.
595
674
  Focus on clarity, completeness, and guiding the model effectively towards the desired style. Explore different structural approaches.
596
-
675
+
597
676
  Example of valid response: [
598
677
  ["role": "<role>", "content": "<Prompt targeting specified style.>"],
599
678
  ["role": "<role>", "content": "<Another prompt designed for the output style.>"]
600
679
  ]
601
680
 
602
681
  Your response MUST be a valid JSON list of AI messages. Do NOT include any other text, explanations, or Markdown formatting like ```json ... ``` around the list.
603
-
682
+
604
683
  """
605
684
  try:
606
685
  response_content = self._call_model(
607
686
  messages=[
608
- {"role": "system", "content": f"You are an expert prompt engineer. Your task is to generate novel, effective prompts from scratch based on a task description, specifically aiming for prompts that elicit answers in the style: '{current_output_style_guidance}'. Output ONLY a raw JSON list of strings."},
609
- {"role": "user", "content": fresh_start_user_prompt}
687
+ {
688
+ "role": "system",
689
+ "content": f"You are an expert prompt engineer. Your task is to generate novel, effective prompts from scratch based on a task description, specifically aiming for prompts that elicit answers in the style: '{current_output_style_guidance}'. Output ONLY a raw JSON list of strings.",
690
+ },
691
+ {"role": "user", "content": fresh_start_user_prompt},
610
692
  ],
611
- is_reasoning=True
693
+ is_reasoning=True,
612
694
  )
613
-
614
- logger.debug(f"Raw LLM response for fresh start prompts: {response_content}")
615
-
695
+
696
+ logger.debug(
697
+ f"Raw LLM response for fresh start prompts: {response_content}"
698
+ )
699
+
616
700
  fresh_prompts = utils.json_to_dict(response_content)
617
701
  if isinstance(fresh_prompts, list):
618
- if all(isinstance(p, dict) for p in fresh_prompts) and all(p.get("role") is not None for p in fresh_prompts):
619
- population.append(chat_prompt.ChatPrompt(messages=fresh_prompts))
702
+ if all(isinstance(p, dict) for p in fresh_prompts) and all(
703
+ p.get("role") is not None for p in fresh_prompts
704
+ ):
705
+ population.append(
706
+ chat_prompt.ChatPrompt(messages=fresh_prompts)
707
+ )
620
708
  init_pop_report.success_fresh_prompts(1)
621
709
  elif all(isinstance(p, list) for p in fresh_prompts):
622
- population.extend([chat_prompt.ChatPrompt(messages=p) for p in fresh_prompts[:num_fresh_starts]])
623
- init_pop_report.success_fresh_prompts(len(fresh_prompts[:num_fresh_starts]))
710
+ population.extend(
711
+ [
712
+ chat_prompt.ChatPrompt(messages=p)
713
+ for p in fresh_prompts[:num_fresh_starts]
714
+ ]
715
+ )
716
+ init_pop_report.success_fresh_prompts(
717
+ len(fresh_prompts[:num_fresh_starts])
718
+ )
624
719
  else:
625
720
  init_pop_report.failed_fresh_prompts(
626
721
  num_fresh_starts,
627
- f"LLM response for fresh starts was not a valid list of strings or was empty: {response_content}. Skipping fresh start prompts."
722
+ f"LLM response for fresh starts was not a valid list of strings or was empty: {response_content}. Skipping fresh start prompts.",
628
723
  )
629
724
  except json.JSONDecodeError as e_json:
630
725
  init_pop_report.failed_fresh_prompts(
631
726
  num_fresh_starts,
632
- f"JSONDecodeError generating fresh start prompts: {e_json}. LLM response: '{response_content}'. Skipping fresh start prompts."
727
+ f"JSONDecodeError generating fresh start prompts: {e_json}. LLM response: '{response_content}'. Skipping fresh start prompts.",
633
728
  )
634
729
  except Exception as e:
635
730
  init_pop_report.failed_fresh_prompts(
636
731
  num_fresh_starts,
637
- f"Error generating fresh start prompts: {e}. Skipping fresh start prompts."
732
+ f"Error generating fresh start prompts: {e}. Skipping fresh start prompts.",
638
733
  )
639
734
 
640
735
  # Generate variations on the initial prompt for the remaining slots
641
736
  # TODO: Could add variations with hyper-parameters from the task config like temperature, etc.
642
737
  if num_variations_on_initial > 0:
643
738
  init_pop_report.start_variations(num_variations_on_initial)
644
-
739
+
645
740
  # TODO: We need to split this into batches as the model will not return enough tokens
646
741
  # to generate all the candidates
647
742
  user_prompt_for_variation = f"""Initial prompt:
648
- '''{prompt.formatted_messages}'''
743
+ '''{prompt.get_messages()}'''
649
744
 
650
745
  Task context:
651
746
  {task_desc_for_llm}
@@ -672,36 +767,61 @@ Return only the new prompt list object.
672
767
  try:
673
768
  response_content_variations = self._call_model(
674
769
  messages=[
675
- {"role": "system", "content": self._get_reasoning_system_prompt_for_variation()},
676
- {"role": "user", "content": user_prompt_for_variation}
770
+ {
771
+ "role": "system",
772
+ "content": self._get_reasoning_system_prompt_for_variation(),
773
+ },
774
+ {"role": "user", "content": user_prompt_for_variation},
677
775
  ],
678
- is_reasoning=True
776
+ is_reasoning=True,
777
+ )
778
+ logger.debug(
779
+ f"Raw response for population variations: {response_content_variations}"
679
780
  )
680
- logger.debug(f"Raw response for population variations: {response_content_variations}")
681
781
  json_response_variations = json.loads(response_content_variations)
682
- generated_prompts_variations = [p["prompt"] for p in json_response_variations.get("prompts", []) if isinstance(p, dict) and "prompt" in p]
683
-
782
+ generated_prompts_variations = [
783
+ p["prompt"]
784
+ for p in json_response_variations.get("prompts", [])
785
+ if isinstance(p, dict) and "prompt" in p
786
+ ]
787
+
684
788
  if generated_prompts_variations:
685
- init_pop_report.success_variations(len(generated_prompts_variations[:num_variations_on_initial]))
686
- population.extend([chat_prompt.ChatPrompt(messages=p) for p in generated_prompts_variations[:num_variations_on_initial]])
789
+ init_pop_report.success_variations(
790
+ len(
791
+ generated_prompts_variations[:num_variations_on_initial]
792
+ )
793
+ )
794
+ population.extend(
795
+ [
796
+ chat_prompt.ChatPrompt(messages=p)
797
+ for p in generated_prompts_variations[
798
+ :num_variations_on_initial
799
+ ]
800
+ ]
801
+ )
687
802
  else:
688
- init_pop_report.failed_variations(num_variations_on_initial, "Could not parse 'prompts' list for variations. Skipping variations.")
803
+ init_pop_report.failed_variations(
804
+ num_variations_on_initial,
805
+ "Could not parse 'prompts' list for variations. Skipping variations.",
806
+ )
689
807
  except Exception as e:
690
- init_pop_report.failed_variations(num_variations_on_initial, f"Error calling LLM for initial population variations: {e}")
808
+ init_pop_report.failed_variations(
809
+ num_variations_on_initial,
810
+ f"Error calling LLM for initial population variations: {e}",
811
+ )
691
812
 
692
813
  # Ensure population is of the required size using unique prompts
693
814
  # TODO Test with levenshtein distance
694
815
  final_population_set: Set[str] = set()
695
816
  final_population_list: List[chat_prompt.ChatPrompt] = []
696
817
  for p in population:
697
- if json.dumps(p.formatted_messages) not in final_population_set:
698
- final_population_set.add(json.dumps(p.formatted_messages))
818
+ if json.dumps(p.get_messages()) not in final_population_set:
819
+ final_population_set.add(json.dumps(p.get_messages()))
699
820
  final_population_list.append(p)
700
-
821
+
701
822
  init_pop_report.end(final_population_list)
702
823
  # Return exactly population_size prompts if possible, or fewer if generation failed badly.
703
- return final_population_list[:self.population_size]
704
-
824
+ return final_population_list[: self.population_size]
705
825
 
706
826
  def _should_restart_population(self, curr_best: float) -> bool:
707
827
  """
@@ -709,7 +829,9 @@ Return only the new prompt list object.
709
829
  a population restart based on lack of improvement.
710
830
  """
711
831
  if self._best_primary_score_history:
712
- threshold = self._best_primary_score_history[-1] * (1 + self.DEFAULT_RESTART_THRESHOLD)
832
+ threshold = self._best_primary_score_history[-1] * (
833
+ 1 + self.DEFAULT_RESTART_THRESHOLD
834
+ )
713
835
  if curr_best < threshold:
714
836
  self._gens_since_pop_improvement += 1
715
837
  else:
@@ -720,9 +842,9 @@ Return only the new prompt list object.
720
842
  def _restart_population(
721
843
  self,
722
844
  hof: tools.HallOfFame,
723
- population: list["creator.Individual"],
845
+ population: List[Any],
724
846
  best_prompt_so_far: chat_prompt.ChatPrompt,
725
- ) -> list["creator.Individual"]:
847
+ ) -> List[Any]:
726
848
  """Return a fresh, evaluated population seeded by elites."""
727
849
  if self.enable_moo:
728
850
  elites = list(hof)
@@ -730,12 +852,15 @@ Return only the new prompt list object.
730
852
  elites = tools.selBest(population, self.elitism_size)
731
853
 
732
854
  seed_prompt = (
733
- chat_prompt.ChatPrompt(messages=max(elites, key=lambda x: x.fitness.values[0]))
734
- if elites else best_prompt_so_far
855
+ chat_prompt.ChatPrompt(
856
+ messages=max(elites, key=lambda x: x.fitness.values[0])
857
+ )
858
+ if elites
859
+ else best_prompt_so_far
735
860
  )
736
861
 
737
862
  prompt_variants = self._initialize_population(seed_prompt)
738
- new_pop = [creator.Individual(p.formatted_messages) for p in prompt_variants]
863
+ new_pop = [creator.Individual(p.get_messages()) for p in prompt_variants]
739
864
 
740
865
  for ind, fit in zip(new_pop, map(self.toolbox.evaluate, new_pop)):
741
866
  ind.fitness.values = fit
@@ -746,12 +871,12 @@ Return only the new prompt list object.
746
871
  def _run_generation(
747
872
  self,
748
873
  generation_idx: int,
749
- population: list["creator.Individual"],
874
+ population: List[Any],
750
875
  prompt: chat_prompt.ChatPrompt,
751
876
  hof: tools.HallOfFame,
752
877
  report: Any,
753
878
  best_primary_score_overall: float,
754
- ) -> tuple[list["creator.Individual"], int]:
879
+ ) -> tuple[List[Any], int]:
755
880
  """Execute mating, mutation, evaluation and HoF update."""
756
881
  best_gen_score = 0.0
757
882
 
@@ -760,20 +885,23 @@ Return only the new prompt list object.
760
885
  offspring = self.toolbox.select(population, self.population_size)
761
886
  else:
762
887
  elites = tools.selBest(population, self.elitism_size)
763
- rest = self.toolbox.select(population, len(population) - self.elitism_size)
888
+ rest = self.toolbox.select(population, len(population) - self.elitism_size)
764
889
  offspring = elites + rest
765
890
 
766
891
  # --- crossover -------------------------------------------------
767
892
  report.performing_crossover()
768
893
  offspring = list(map(self.toolbox.clone, offspring))
769
894
  for i in range(0, len(offspring), 2):
770
- if i+1 < len(offspring):
771
- c1, c2 = offspring[i], offspring[i+1]
895
+ if i + 1 < len(offspring):
896
+ c1, c2 = offspring[i], offspring[i + 1]
772
897
  if random.random() < self.crossover_rate:
773
898
  c1_new, c2_new = self.toolbox.mate(c1, c2)
774
- offspring[i], offspring[i+1] = c1_new, c2_new
775
- del offspring[i].fitness.values, offspring[i+1].fitness.values
776
- reporting.display_success(" Crossover successful, prompts have been combined and edited.\n│", verbose=self.verbose)
899
+ offspring[i], offspring[i + 1] = c1_new, c2_new
900
+ del offspring[i].fitness.values, offspring[i + 1].fitness.values
901
+ reporting.display_success(
902
+ " Crossover successful, prompts have been combined and edited.\n│",
903
+ verbose=self.verbose,
904
+ )
777
905
 
778
906
  # --- mutation --------------------------------------------------
779
907
  report.performing_mutation()
@@ -785,27 +913,40 @@ Return only the new prompt list object.
785
913
  offspring[i] = new_ind
786
914
  del offspring[i].fitness.values
787
915
  n_mutations += 1
788
- reporting.display_success(f" Mutation successful, {n_mutations} prompts have been edited.\n│", verbose=self.verbose)
789
-
916
+ reporting.display_success(
917
+ f" Mutation successful, {n_mutations} prompts have been edited.\n│",
918
+ verbose=self.verbose,
919
+ )
920
+
790
921
  # --- evaluation ------------------------------------------------
791
922
  invalid = [ind for ind in offspring if not ind.fitness.valid]
792
923
  report.performing_evaluation(len(invalid))
793
924
  for ind_idx, ind in enumerate(invalid):
794
925
  fit = self.toolbox.evaluate(ind)
795
- ind.fitness.values = fit
926
+ if self.enable_moo:
927
+ ind.fitness.values = fit
928
+ else:
929
+ ind.fitness.values = tuple([fit[0]])
796
930
  best_gen_score = max(best_gen_score, fit[0])
797
931
 
798
932
  report.performed_evaluation(ind_idx, ind.fitness.values[0])
799
-
933
+
800
934
  # --- update HoF & reporter ------------------------------------
801
935
  hof.update(offspring)
802
- reporting.end_gen(generation_idx, best_gen_score, best_primary_score_overall, verbose=self.verbose)
803
-
936
+ reporting.end_gen(
937
+ generation_idx,
938
+ best_gen_score,
939
+ best_primary_score_overall,
940
+ verbose=self.verbose,
941
+ )
942
+
804
943
  return offspring, len(invalid)
805
944
 
806
- def _population_best_score(self, population: List["creator.Individual"]) -> float:
945
+ def _population_best_score(self, population: List[Any]) -> float:
807
946
  """Return highest primary-objective score among *valid* individuals."""
808
- valid_scores = [ind.fitness.values[0] for ind in population if ind.fitness.valid]
947
+ valid_scores = [
948
+ ind.fitness.values[0] for ind in population if ind.fitness.valid
949
+ ]
809
950
  return max(valid_scores, default=0.0)
810
951
 
811
952
  def optimize_prompt(
@@ -816,7 +957,8 @@ Return only the new prompt list object.
816
957
  experiment_config: Optional[Dict] = None,
817
958
  n_samples: Optional[int] = None,
818
959
  auto_continue: bool = False,
819
- **kwargs,
960
+ agent_class: Optional[Type[OptimizableAgent]] = None,
961
+ **kwargs: Any,
820
962
  ) -> OptimizationResult:
821
963
  """
822
964
  Args:
@@ -828,9 +970,51 @@ Return only the new prompt list object.
828
970
  auto_continue: Whether to automatically continue optimization
829
971
  **kwargs: Additional keyword arguments
830
972
  """
831
- reporting.display_header(self.__class__.__name__, verbose=self.verbose)
973
+ if not isinstance(prompt, chat_prompt.ChatPrompt):
974
+ raise ValueError("Prompt must be a ChatPrompt object")
975
+
976
+ if not isinstance(dataset, opik.Dataset):
977
+ raise ValueError("Dataset must be a Dataset object")
978
+
979
+ if not callable(metric):
980
+ raise ValueError(
981
+ "Metric must be a function that takes `dataset_item` and `llm_output` as arguments."
982
+ )
983
+
984
+ if prompt.model is None:
985
+ prompt.model = self.model
986
+ if prompt.model_kwargs is None:
987
+ prompt.model_kwargs = self.model_kwargs
988
+
989
+ if agent_class is None:
990
+ self.agent_class = utils.create_litellm_agent_class(prompt)
991
+ else:
992
+ self.agent_class = agent_class
993
+
994
+ self.project_name = self.agent_class.project_name
995
+
996
+ # Step 0. Start Opik optimization run
997
+ opik_optimization_run: Optional[optimization.Optimization] = None
998
+ try:
999
+ opik_optimization_run = self._opik_client.create_optimization(
1000
+ dataset_name=dataset.name,
1001
+ objective_name=metric.__name__,
1002
+ metadata={"optimizer": self.__class__.__name__},
1003
+ )
1004
+ self._current_optimization_id = opik_optimization_run.id
1005
+ except Exception as e:
1006
+ logger.warning(f"Opik server error: {e}. Continuing without Opik tracking.")
1007
+ self._current_optimization_id = None
1008
+
1009
+ reporting.display_header(
1010
+ algorithm=self.__class__.__name__,
1011
+ optimization_id=self._current_optimization_id,
1012
+ dataset_id=dataset.id,
1013
+ verbose=self.verbose,
1014
+ )
1015
+
832
1016
  reporting.display_configuration(
833
- prompt.formatted_messages,
1017
+ prompt.get_messages(),
834
1018
  {
835
1019
  "optimizer": f"{ 'DEAP MOO' if self.enable_moo else 'DEAP SO' } Evolutionary Optimization",
836
1020
  "population_size": self.population_size,
@@ -838,79 +1022,79 @@ Return only the new prompt list object.
838
1022
  "mutation_rate": self.mutation_rate,
839
1023
  "crossover_rate": self.crossover_rate,
840
1024
  },
841
- verbose=self.verbose
1025
+ verbose=self.verbose,
842
1026
  )
843
1027
 
1028
+ # Step 1. Step variables and define fitness function
844
1029
  self.llm_call_counter = 0
845
- self._history = []
846
- self._current_optimization_id = None
1030
+ self._history: List[OptimizationRound] = []
847
1031
  self._current_generation = 0
848
1032
  self._best_fitness_history = []
849
1033
  self._generations_without_improvement = 0
850
- self._llm_cache.clear()
851
1034
  self._current_population = []
852
1035
  self._generations_without_overall_improvement = 0
853
-
854
- # Step 0. Define fitness function
1036
+
855
1037
  if self.enable_moo:
1038
+
856
1039
  def _deap_evaluate_individual_fitness(
857
- messages: List[Dict[str, str]]
858
- ) -> Tuple[float, float]:
859
- primary_fitness_score: float = self.evaluate_prompt(
860
- prompt=chat_prompt.ChatPrompt(messages=messages),
1040
+ messages: List[Dict[str, str]],
1041
+ ) -> Tuple[float, float]:
1042
+ primary_fitness_score: float = self._evaluate_prompt(
1043
+ prompt,
1044
+ messages, # type: ignore
861
1045
  dataset=dataset,
862
1046
  metric=metric,
863
1047
  n_samples=n_samples,
864
1048
  experiment_config=(experiment_config or {}).copy(),
865
1049
  optimization_id=self._current_optimization_id,
866
- verbose=0
1050
+ verbose=0,
867
1051
  )
868
1052
  prompt_length = float(len(str(json.dumps(messages))))
869
1053
  return (primary_fitness_score, prompt_length)
1054
+
870
1055
  else:
871
1056
  # Single-objective
872
1057
  def _deap_evaluate_individual_fitness(
873
- messages: List[Dict[str, str]]
874
- ) -> Tuple[float,]:
875
- fitness_score: float = self.evaluate_prompt(
876
- prompt=chat_prompt.ChatPrompt(messages=messages),
1058
+ messages: List[Dict[str, str]],
1059
+ ) -> Tuple[float, float]:
1060
+ fitness_score: float = self._evaluate_prompt(
1061
+ prompt,
1062
+ messages, # type: ignore
877
1063
  dataset=dataset,
878
1064
  metric=metric,
879
1065
  n_samples=n_samples,
880
1066
  experiment_config=(experiment_config or {}).copy(),
881
1067
  optimization_id=self._current_optimization_id,
882
- verbose=0
1068
+ verbose=0,
883
1069
  )
884
- return (fitness_score,)
1070
+ return (fitness_score, 0.0)
1071
+
885
1072
  self.toolbox.register("evaluate", _deap_evaluate_individual_fitness)
886
1073
 
887
- # Step 1. Start Opik optimization run
888
- opik_optimization_run: Optional[optimization.Optimization] = None
889
- try:
890
- opik_optimization_run: optimization.Optimization = self._opik_client.create_optimization(
891
- dataset_name=dataset.name,
892
- objective_name=metric.__name__,
893
- metadata={"optimizer": self.__class__.__name__},
1074
+ # Step 2. Compute the initial performance of the prompt
1075
+ with reporting.baseline_performance(
1076
+ verbose=self.verbose
1077
+ ) as report_baseline_performance:
1078
+ initial_eval_result = _deap_evaluate_individual_fitness(
1079
+ prompt.get_messages()
1080
+ ) # type: ignore
1081
+ initial_primary_score = initial_eval_result[0]
1082
+ initial_length = (
1083
+ initial_eval_result[1]
1084
+ if self.enable_moo
1085
+ else float(len(json.dumps(prompt.get_messages())))
894
1086
  )
895
- self._current_optimization_id = opik_optimization_run.id
896
- logger.info(f"Created Opik Optimization run with ID: {self._current_optimization_id}")
897
- except Exception as e:
898
- logger.warning(f"Opik server error: {e}. Continuing without Opik tracking.")
899
1087
 
900
- # Step 2. Compute the initial performance of the prompt
901
- with reporting.baseline_performance(verbose=self.verbose) as report_baseline_performance:
902
- initial_eval_result: Tuple[float, float] | Tuple[float, ] = _deap_evaluate_individual_fitness(prompt.formatted_messages)
903
- initial_primary_score: float = initial_eval_result[0]
904
- initial_length: float = initial_eval_result[1] if self.enable_moo else float(len(json.dumps(prompt.formatted_messages)))
905
-
906
- best_primary_score_overall: float = initial_primary_score
1088
+ best_primary_score_overall = initial_primary_score
907
1089
  best_prompt_overall = prompt
908
1090
  report_baseline_performance.set_score(initial_primary_score)
909
-
1091
+
910
1092
  # Step 3. Define the output style guide
911
1093
  effective_output_style_guidance = self.output_style_guidance
912
- if self.infer_output_style and \
913
- (self.output_style_guidance is None or self.output_style_guidance == self.DEFAULT_OUTPUT_STYLE_GUIDANCE):
1094
+ if self.infer_output_style and (
1095
+ self.output_style_guidance is None
1096
+ or self.output_style_guidance == self.DEFAULT_OUTPUT_STYLE_GUIDANCE
1097
+ ):
914
1098
  # If user wants inference AND hasn't provided a specific custom guidance
915
1099
  inferred_style = self._infer_output_style_from_dataset(dataset, prompt)
916
1100
  if inferred_style:
@@ -918,22 +1102,26 @@ Return only the new prompt list object.
918
1102
  # Update self.output_style_guidance for this run so dynamic prompt methods use it
919
1103
  self.output_style_guidance = inferred_style
920
1104
  else:
921
- logger.warning("Failed to infer output style, using default or user-provided guidance.")
1105
+ logger.warning(
1106
+ "Failed to infer output style, using default or user-provided guidance."
1107
+ )
922
1108
 
923
1109
  # Ensure self.output_style_guidance is set to the effective one for the rest of the methods for this run
924
1110
  # (It might have been None if user passed None and infer_output_style was False)
925
1111
  if self.output_style_guidance is None:
926
1112
  # Fallback if still None
927
1113
  self.output_style_guidance = self.DEFAULT_OUTPUT_STYLE_GUIDANCE
928
-
1114
+
929
1115
  # Step 4. Initialize population
930
1116
  initial_prompts: List[chat_prompt.ChatPrompt] = self._initialize_population(
931
1117
  prompt=prompt
932
1118
  )
933
-
934
- deap_population = [creator.Individual(p.formatted_messages) for p in initial_prompts]
935
- deap_population = deap_population[:self.population_size]
936
-
1119
+
1120
+ deap_population = [
1121
+ creator.Individual(p.get_messages()) for p in initial_prompts
1122
+ ]
1123
+ deap_population = deap_population[: self.population_size]
1124
+
937
1125
  # Step 5. Initialize the hall of fame (Pareto front for MOO) and stats for MOO or SO
938
1126
  if self.enable_moo:
939
1127
  hof = tools.ParetoFront()
@@ -942,44 +1130,72 @@ Return only the new prompt list object.
942
1130
  hof = tools.HallOfFame(self.DEFAULT_HALL_OF_FAME_SIZE)
943
1131
 
944
1132
  # Step 6. Evaluate the initial population
945
- with reporting.evaluate_initial_population(verbose=self.verbose) as report_initial_population:
946
- fitnesses: List[float] = list(map(self.toolbox.evaluate, deap_population))
947
- _best_score = max(best_primary_score_overall, max([x[0] for x in fitnesses]))
1133
+ with reporting.evaluate_initial_population(
1134
+ verbose=self.verbose
1135
+ ) as report_initial_population:
1136
+ fitnesses: List[Any] = list(map(self.toolbox.evaluate, deap_population))
1137
+ _best_score = max(
1138
+ best_primary_score_overall, max([x[0] for x in fitnesses])
1139
+ )
948
1140
 
949
- for i, ind, fit in zip(range(len(deap_population)), deap_population, fitnesses):
950
- ind.fitness.values = fit
1141
+ for i, ind, fit in zip(
1142
+ range(len(deap_population)), deap_population, fitnesses
1143
+ ):
1144
+ if self.enable_moo:
1145
+ ind.fitness.values = fit
1146
+ else:
1147
+ ind.fitness.values = tuple([fit[0]])
951
1148
  report_initial_population.set_score(i, fit[0], _best_score)
952
-
1149
+
953
1150
  hof.update(deap_population)
954
-
1151
+
955
1152
  if hof and len(hof) > 0:
956
1153
  if self.enable_moo:
957
- current_best_for_primary: creator.Individual = max(hof, key=lambda ind: ind.fitness.values[0])
958
- best_primary_score_overall: float = current_best_for_primary.fitness.values[0]
959
- best_prompt_overall = chat_prompt.ChatPrompt(messages=current_best_for_primary)
1154
+ current_best_for_primary: Any = max(
1155
+ hof, key=lambda ind: ind.fitness.values[0]
1156
+ )
1157
+ best_primary_score_overall = current_best_for_primary.fitness.values[0]
1158
+ best_prompt_overall = chat_prompt.ChatPrompt(
1159
+ messages=current_best_for_primary
1160
+ )
960
1161
  else:
961
1162
  # Single-objective
962
1163
  current_best_on_front = hof[0]
963
- best_primary_score_overall: float = current_best_on_front.fitness.values[0]
964
-
1164
+ best_primary_score_overall = current_best_on_front.fitness.values[0]
1165
+ best_prompt_overall = chat_prompt.ChatPrompt(
1166
+ messages=current_best_on_front
1167
+ )
1168
+
965
1169
  if self.enable_moo:
966
- logger.info(f"Gen {0}: New best primary score: {best_primary_score_overall:.4f}, Prompt: {json.dumps(best_prompt_overall.formatted_messages)[:100]}...")
1170
+ logger.info(
1171
+ f"Gen {0}: New best primary score: {best_primary_score_overall:.4f}, Prompt: {json.dumps(best_prompt_overall.get_messages())[:100]}..."
1172
+ )
967
1173
  else:
968
- logger.info(f"Gen {0}: New best score: {best_primary_score_overall:.4f}")
1174
+ logger.info(
1175
+ f"Gen {0}: New best score: {best_primary_score_overall:.4f}"
1176
+ )
969
1177
 
970
1178
  # Simplified history logging for this transition
971
1179
  initial_round_data = OptimizationRound(
972
1180
  round_number=0,
973
- current_prompt=best_prompt_overall, # Representative best
1181
+ current_prompt=best_prompt_overall, # Representative best
974
1182
  current_score=best_primary_score_overall,
975
- generated_prompts=[{"prompt": best_prompt_overall, "score": best_primary_score_overall, "trial_scores": [best_primary_score_overall]}],
1183
+ generated_prompts=[
1184
+ {
1185
+ "prompt": best_prompt_overall,
1186
+ "score": best_primary_score_overall,
1187
+ "trial_scores": [best_primary_score_overall],
1188
+ }
1189
+ ],
976
1190
  best_prompt=best_prompt_overall,
977
1191
  best_score=best_primary_score_overall,
978
- improvement=0.0
979
- ).dict()
1192
+ improvement=0.0,
1193
+ )
980
1194
  self._add_to_history(initial_round_data)
981
1195
 
982
- with reporting.start_evolutionary_algo(verbose=self.verbose) as report_evolutionary_algo:
1196
+ with reporting.start_evolutionary_algo(
1197
+ verbose=self.verbose
1198
+ ) as report_evolutionary_algo:
983
1199
  for generation_idx in range(1, self.num_generations + 1):
984
1200
  report_evolutionary_algo.start_gen(generation_idx, self.num_generations)
985
1201
 
@@ -987,21 +1203,30 @@ Return only the new prompt list object.
987
1203
 
988
1204
  # ---------- restart logic -------------------------------------
989
1205
  if self._should_restart_population(curr_best_score):
990
- report_evolutionary_algo.restart_population(self.DEFAULT_RESTART_GENERATIONS)
1206
+ report_evolutionary_algo.restart_population(
1207
+ self.DEFAULT_RESTART_GENERATIONS
1208
+ )
991
1209
  deap_population = self._restart_population(
992
1210
  hof, deap_population, best_prompt_overall
993
1211
  )
994
1212
 
995
1213
  # ---------- run one generation --------------------------------
996
1214
  deap_population, invalid_count = self._run_generation(
997
- generation_idx, deap_population, prompt, hof, report_evolutionary_algo, best_primary_score_overall
1215
+ generation_idx,
1216
+ deap_population,
1217
+ prompt,
1218
+ hof,
1219
+ report_evolutionary_algo,
1220
+ best_primary_score_overall,
998
1221
  )
999
1222
 
1000
1223
  # -------- update best-prompt bookkeeping -------------------------
1001
1224
  previous_best_primary_score_for_gen = best_primary_score_overall
1002
1225
  if hof:
1003
1226
  if self.enable_moo:
1004
- current_best_ind = max(hof, key=lambda ind: ind.fitness.values[0])
1227
+ current_best_ind = max(
1228
+ hof, key=lambda ind: ind.fitness.values[0]
1229
+ )
1005
1230
  else:
1006
1231
  current_best_ind = hof[0]
1007
1232
 
@@ -1009,7 +1234,10 @@ Return only the new prompt list object.
1009
1234
  if updated_best_primary_score > best_primary_score_overall:
1010
1235
  best_primary_score_overall = updated_best_primary_score
1011
1236
  self._generations_without_overall_improvement = 0
1012
- elif updated_best_primary_score == previous_best_primary_score_for_gen:
1237
+ elif (
1238
+ updated_best_primary_score
1239
+ == previous_best_primary_score_for_gen
1240
+ ):
1013
1241
  self._generations_without_overall_improvement += 1
1014
1242
  else:
1015
1243
  self._generations_without_overall_improvement += 1
@@ -1017,7 +1245,10 @@ Return only the new prompt list object.
1017
1245
  self._generations_without_overall_improvement += 1
1018
1246
 
1019
1247
  # ---------- early-stopping check ------------------------------
1020
- if self._generations_without_overall_improvement >= self.DEFAULT_EARLY_STOPPING_GENERATIONS:
1248
+ if (
1249
+ self._generations_without_overall_improvement
1250
+ >= self.DEFAULT_EARLY_STOPPING_GENERATIONS
1251
+ ):
1021
1252
  logger.info(
1022
1253
  "No overall improvement for %d generations – early stopping at gen %d.",
1023
1254
  self.DEFAULT_EARLY_STOPPING_GENERATIONS,
@@ -1029,114 +1260,168 @@ Return only the new prompt list object.
1029
1260
  # FIXME: Use model.dump() instead of dict()
1030
1261
  gen_round_data = OptimizationRound(
1031
1262
  round_number=generation_idx,
1032
- current_prompt=best_prompt_overall, # Representative best
1263
+ current_prompt=best_prompt_overall, # Representative best
1033
1264
  current_score=best_primary_score_overall,
1034
- generated_prompts=[{"prompt": str(ind), "score": ind.fitness.values[0]} for ind in deap_population if ind.fitness.valid],
1265
+ generated_prompts=[
1266
+ {"prompt": str(ind), "score": ind.fitness.values[0]}
1267
+ for ind in deap_population
1268
+ if ind.fitness.valid
1269
+ ],
1035
1270
  best_prompt=best_prompt_overall,
1036
1271
  best_score=best_primary_score_overall,
1037
- improvement=(best_primary_score_overall - initial_primary_score) / abs(initial_primary_score) if initial_primary_score and initial_primary_score != 0 else (1.0 if best_primary_score_overall > 0 else 0.0)
1038
- ).dict()
1272
+ improvement=(
1273
+ (best_primary_score_overall - initial_primary_score)
1274
+ / abs(initial_primary_score)
1275
+ if initial_primary_score and initial_primary_score != 0
1276
+ else (1.0 if best_primary_score_overall > 0 else 0.0)
1277
+ ),
1278
+ )
1039
1279
  self._add_to_history(gen_round_data)
1040
1280
 
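The relative-improvement value recorded in each round above guards against a zero or missing baseline score. A small standalone illustration of that computation (plain Python, no package imports):

```python
def relative_improvement(initial_score: float, best_score: float) -> float:
    """Mirror of the guard used above when recording each generation's round data."""
    if initial_score and initial_score != 0:
        return (best_score - initial_score) / abs(initial_score)
    # No usable baseline: report full improvement if any positive score was reached.
    return 1.0 if best_score > 0 else 0.0

print(relative_improvement(0.50, 0.65))  # ~0.30, i.e. a 30% relative gain
print(relative_improvement(0.0, 0.42))   # 1.0, baseline was zero
```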
1041
- stopped_early_flag = self._generations_without_overall_improvement >= self.DEFAULT_EARLY_STOPPING_GENERATIONS
1281
+ stopped_early_flag = (
1282
+ self._generations_without_overall_improvement
1283
+ >= self.DEFAULT_EARLY_STOPPING_GENERATIONS
1284
+ )
1042
1285
  final_details = {}
1043
1286
  initial_score_for_display = initial_primary_score
1044
1287
 
1045
1288
  if self.enable_moo:
1046
1289
  final_results_log = "Pareto Front Solutions:\n"
1047
1290
  if hof and len(hof) > 0:
1048
- sorted_hof = sorted(hof, key=lambda ind: ind.fitness.values[0], reverse=True)
1291
+ sorted_hof = sorted(
1292
+ hof, key=lambda ind: ind.fitness.values[0], reverse=True
1293
+ )
1049
1294
  for i, sol in enumerate(sorted_hof):
1050
1295
  final_results_log += f" Solution {i+1}: Primary Score={sol.fitness.values[0]:.4f}, Length={sol.fitness.values[1]:.0f}, Prompt='{str(sol)[:100]}...'\n"
1051
1296
  best_overall_solution = sorted_hof[0]
1052
- final_best_prompt = chat_prompt.ChatPrompt(messages=best_overall_solution)
1297
+ final_best_prompt = chat_prompt.ChatPrompt(
1298
+ messages=best_overall_solution
1299
+ )
1053
1300
  final_primary_score = best_overall_solution.fitness.values[0]
1054
1301
  final_length = best_overall_solution.fitness.values[1]
1055
1302
  logger.info(final_results_log)
1056
- logger.info(f"Representative best prompt (highest primary score from Pareto front): '{final_best_prompt}'")
1057
- logger.info(f" Primary Score ({metric.__name__}): {final_primary_score:.4f}")
1303
+ logger.info(
1304
+ f"Representative best prompt (highest primary score from Pareto front): '{final_best_prompt}'"
1305
+ )
1306
+ logger.info(
1307
+ f" Primary Score ({metric.__name__}): {final_primary_score:.4f}"
1308
+ )
1058
1309
  logger.info(f" Length: {final_length:.0f}")
1059
- final_details.update({
1060
- "initial_primary_score": initial_primary_score,
1061
- "initial_length": initial_length,
1062
- "final_prompt_representative": final_best_prompt,
1063
- "final_primary_score_representative": final_primary_score,
1064
- "final_length_representative": final_length,
1065
- "pareto_front_solutions": [
1066
- {"prompt": str(ind), "score": ind.fitness.values[0], "length": ind.fitness.values[1]}
1067
- for ind in hof
1068
- ] if hof else []
1069
- })
1310
+ final_details.update(
1311
+ {
1312
+ "initial_primary_score": initial_primary_score,
1313
+ "initial_length": initial_length,
1314
+ "final_prompt_representative": final_best_prompt,
1315
+ "final_primary_score_representative": final_primary_score,
1316
+ "final_length_representative": final_length,
1317
+ "pareto_front_solutions": (
1318
+ [
1319
+ {
1320
+ "prompt": str(ind),
1321
+ "score": ind.fitness.values[0],
1322
+ "length": ind.fitness.values[1],
1323
+ }
1324
+ for ind in hof
1325
+ ]
1326
+ if hof
1327
+ else []
1328
+ ),
1329
+ }
1330
+ )
1070
1331
  else:
1071
1332
  # MOO: ParetoFront is empty. Reporting last known best and fallback values
1072
1333
  logger.warning("MOO: ParetoFront is empty. Reporting last known best.")
1073
1334
  final_best_prompt = best_prompt_overall
1074
1335
  final_primary_score = best_primary_score_overall
1075
- final_length = float(len(json.dumps(final_best_prompt.formatted_messages)))
1076
- final_details.update({"initial_primary_score": initial_primary_score, "initial_length": initial_length,
1077
- "final_prompt_representative": final_best_prompt, "final_primary_score_representative": final_primary_score,
1078
- "final_length_representative": final_length, "pareto_front_solutions": []})
1336
+ final_length = float(len(json.dumps(final_best_prompt.get_messages())))
1337
+ final_details.update(
1338
+ {
1339
+ "initial_primary_score": initial_primary_score,
1340
+ "initial_length": initial_length,
1341
+ "final_prompt_representative": final_best_prompt,
1342
+ "final_primary_score_representative": final_primary_score,
1343
+ "final_length_representative": final_length,
1344
+ "pareto_front_solutions": [],
1345
+ }
1346
+ )
1079
1347
  else:
1080
1348
  # Single-objective
1081
1349
  final_best_prompt = best_prompt_overall
1082
1350
  final_primary_score = best_primary_score_overall
1083
1351
  logger.info(f"Final best prompt from Hall of Fame: '{final_best_prompt}'")
1084
- logger.info(f"Final best score ({metric.__name__}): {final_primary_score:.4f}")
1085
- final_details.update({
1086
- "initial_prompt": prompt.formatted_messages,
1087
- "initial_score": initial_primary_score,
1088
- "initial_score_for_display": initial_primary_score,
1089
- "final_prompt": final_best_prompt,
1090
- "final_score": final_primary_score,
1091
- })
1092
-
1352
+ logger.info(
1353
+ f"Final best score ({metric.__name__}): {final_primary_score:.4f}"
1354
+ )
1355
+ final_details.update(
1356
+ {
1357
+ "initial_prompt": prompt.get_messages(),
1358
+ "initial_score": initial_primary_score,
1359
+ "initial_score_for_display": initial_primary_score,
1360
+ "final_prompt": final_best_prompt,
1361
+ "final_score": final_primary_score,
1362
+ }
1363
+ )
1364
+
1093
1365
  logger.info(f"Total LLM calls during optimization: {self.llm_call_counter}")
1094
1366
  if opik_optimization_run:
1095
1367
  try:
1096
1368
  opik_optimization_run.update(status="completed")
1097
- logger.info(f"Opik Optimization run {self._current_optimization_id} status updated to completed.")
1369
+ logger.info(
1370
+ f"Opik Optimization run {self._current_optimization_id} status updated to completed."
1371
+ )
1098
1372
  except Exception as e:
1099
1373
  logger.warning(f"Failed to update Opik Optimization run status: {e}")
1100
1374
 
1101
1375
  # Add final details
1102
- final_details.update({
1103
- "total_generations_run": generation_idx + 1,
1104
- "population_size": self.population_size,
1105
- "mutation_probability": self.mutation_rate,
1106
- "crossover_probability": self.crossover_rate,
1107
- "elitism_size": self.elitism_size if not self.enable_moo else "N/A (MOO uses NSGA-II)",
1108
- "adaptive_mutation": self.adaptive_mutation,
1109
- "metric_name": metric.__name__,
1110
- "model": self.model,
1111
- "moo_enabled": self.enable_moo,
1112
- "llm_crossover_enabled": self.enable_llm_crossover,
1113
- "seed": self.seed,
1114
- "prompt_type": "single_string_ga",
1115
- "initial_score_for_display": initial_score_for_display,
1116
- "temperature": self.model_kwargs.get("temperature"),
1117
- "stopped_early": stopped_early_flag,
1118
- "rounds": self.get_history(),
1119
- "user_output_style_guidance": self.output_style_guidance,
1120
- "infer_output_style_requested": self.infer_output_style,
1121
- "final_effective_output_style_guidance": effective_output_style_guidance,
1122
- "infer_output_style": self.infer_output_style,
1123
- })
1376
+ final_details.update(
1377
+ {
1378
+ "total_generations_run": generation_idx + 1,
1379
+ "num_generations": self.num_generations,
1380
+ "population_size": self.population_size,
1381
+ "mutation_probability": self.mutation_rate,
1382
+ "crossover_probability": self.crossover_rate,
1383
+ "elitism_size": (
1384
+ self.elitism_size
1385
+ if not self.enable_moo
1386
+ else "N/A (MOO uses NSGA-II)"
1387
+ ),
1388
+ "adaptive_mutation": self.adaptive_mutation,
1389
+ "metric_name": metric.__name__,
1390
+ "model": self.model,
1391
+ "moo_enabled": self.enable_moo,
1392
+ "llm_crossover_enabled": self.enable_llm_crossover,
1393
+ "seed": self.seed,
1394
+ "prompt_type": "single_string_ga",
1395
+ "initial_score_for_display": initial_score_for_display,
1396
+ "temperature": self.model_kwargs.get("temperature"),
1397
+ "stopped_early": stopped_early_flag,
1398
+ "rounds": self.get_history(),
1399
+ "user_output_style_guidance": self.output_style_guidance,
1400
+ "infer_output_style_requested": self.infer_output_style,
1401
+ "final_effective_output_style_guidance": effective_output_style_guidance,
1402
+ "infer_output_style": self.infer_output_style,
1403
+ }
1404
+ )
1124
1405
 
1125
1406
  # Return the OptimizationResult
1126
1407
  reporting.display_result(
1127
1408
  initial_score=initial_score_for_display,
1128
1409
  best_score=final_primary_score,
1129
- best_prompt=final_best_prompt.formatted_messages,
1130
- verbose=self.verbose
1410
+ best_prompt=final_best_prompt.get_messages(),
1411
+ verbose=self.verbose,
1131
1412
  )
1132
1413
  return OptimizationResult(
1133
1414
  optimizer=self.__class__.__name__,
1134
- prompt=final_best_prompt.formatted_messages,
1135
- score=final_primary_score,
1415
+ prompt=final_best_prompt.get_messages(),
1416
+ score=final_primary_score,
1417
+ initial_prompt=prompt.get_messages(),
1418
+ initial_score=initial_primary_score,
1136
1419
  metric_name=metric.__name__,
1137
1420
  details=final_details,
1138
- history=self.get_history(),
1139
- llm_calls=self.llm_call_counter
1421
+ history=[x.model_dump() for x in self.get_history()],
1422
+ llm_calls=self.llm_call_counter,
1423
+ dataset_id=dataset.id,
1424
+ optimization_id=self._current_optimization_id,
1140
1425
  )
1141
1426
 
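The OptimizationResult assembled above now carries both the initial and final prompt/score plus the per-round history. A hedged sketch of how a caller might summarize it; the attribute names are taken from the constructor call above and the `result` object itself is hypothetical:

```python
from typing import Any

def summarize_result(result: Any) -> str:
    """Duck-typed summary of the fields passed to OptimizationResult above."""
    lines = [
        f"optimizer       : {result.optimizer}",
        f"metric          : {result.metric_name}",
        f"initial score   : {result.initial_score:.4f}",
        f"best score      : {result.score:.4f}",
        f"LLM calls       : {result.llm_calls}",
        f"rounds recorded : {len(result.history)}",
        f"optimization id : {result.optimization_id}",
    ]
    return "\n".join(lines)
```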
1142
1427
  @_throttle.rate_limited(_rate_limiter)
@@ -1158,7 +1443,7 @@ Return only the new prompt list object.
1158
1443
  }
1159
1444
 
1160
1445
  # Prepare metadata for opik
1161
- metadata_for_opik = {}
1446
+ metadata_for_opik: Dict[str, Any] = {}
1162
1447
  if self.project_name:
1163
1448
  metadata_for_opik["project_name"] = self.project_name
1164
1449
  metadata_for_opik["opik"] = {"project_name": self.project_name}
@@ -1168,7 +1453,9 @@ Return only the new prompt list object.
1168
1453
  metadata_for_opik["opik"]["optimization_id"] = optimization_id
1169
1454
 
1170
1455
  metadata_for_opik["optimizer_name"] = self.__class__.__name__
1171
- metadata_for_opik["opik_call_type"] = "reasoning" if is_reasoning else "evaluation_llm_task_direct"
1456
+ metadata_for_opik["opik_call_type"] = (
1457
+ "reasoning" if is_reasoning else "evaluation_llm_task_direct"
1458
+ )
1172
1459
 
1173
1460
  if metadata_for_opik:
1174
1461
  llm_config_params["metadata"] = metadata_for_opik
@@ -1186,6 +1473,7 @@ Return only the new prompt list object.
1186
1473
  response = litellm.completion(
1187
1474
  model=self.model, messages=messages, **final_call_params
1188
1475
  )
1476
+ self.llm_call_counter += 1
1189
1477
 
1190
1478
  logger.debug(f"Response: {response}")
1191
1479
  return response.choices[0].message.content
@@ -1199,12 +1487,15 @@ Return only the new prompt list object.
1199
1487
  logger.error(f"LiteLLM Context Window Exceeded Error: {e}")
1200
1488
  raise
1201
1489
  except Exception as e:
1202
- logger.error(f"Error calling model '{self.model}': {type(e).__name__} - {e}")
1490
+ logger.error(
1491
+ f"Error calling model '{self.model}': {type(e).__name__} - {e}"
1492
+ )
1203
1493
  raise
1204
1494
 
1205
- def evaluate_prompt(
1495
+ def _evaluate_prompt(
1206
1496
  self,
1207
1497
  prompt: chat_prompt.ChatPrompt,
1498
+ messages: List[Dict[str, str]],
1208
1499
  dataset: opik.Dataset,
1209
1500
  metric: Callable,
1210
1501
  n_samples: Optional[int] = None,
@@ -1212,12 +1503,13 @@ Return only the new prompt list object.
1212
1503
  experiment_config: Optional[Dict] = None,
1213
1504
  optimization_id: Optional[str] = None,
1214
1505
  verbose: int = 0,
1506
+ **kwargs: Any,
1215
1507
  ) -> float:
1216
1508
  """
1217
1509
  Evaluate a single prompt (individual) against the dataset.
1218
-
1510
+
1219
1511
  Args:
1220
- prompt: The prompt to evaluate
1512
+ prompt: The base chat prompt; the candidate `messages` are applied to a copy of it before evaluation
1221
1513
  dataset: The dataset to use for evaluation
1222
1514
  metric: Metric function to evaluate on, should have the arguments `dataset_item` and `llm_output`
1223
1515
  n_samples: Optional number of samples to use
@@ -1225,46 +1517,44 @@ Return only the new prompt list object.
1225
1517
  experiment_config: Optional experiment configuration
1226
1518
  optimization_id: Optional optimization ID
1227
1519
  verbose: Controls internal logging/progress bars (0=off, 1=on).
1228
-
1520
+
1229
1521
  Returns:
1230
1522
  float: The metric value
1231
1523
  """
1232
1524
  total_items = len(dataset.get_items())
1233
-
1234
- current_experiment_config = experiment_config or {}
1235
- current_experiment_config = {
1236
- **current_experiment_config,
1237
- **{
1238
- "optimizer": self.__class__.__name__,
1239
- "metric": metric.__name__,
1240
- "dataset": dataset.name,
1241
- "configuration": {
1242
- "prompt": prompt.formatted_messages,
1243
- "n_samples_for_eval": len(dataset_item_ids) if dataset_item_ids is not None else n_samples,
1244
- "total_dataset_items": total_items,
1245
- },
1525
+
1526
+ experiment_config = experiment_config or {}
1527
+ experiment_config["project_name"] = self.agent_class.project_name
1528
+ experiment_config = {
1529
+ **experiment_config,
1530
+ "optimizer": self.__class__.__name__,
1531
+ "agent_class": self.agent_class.__name__,
1532
+ "agent_config": prompt.to_dict(),
1533
+ "metric": metric.__name__,
1534
+ "dataset": dataset.name,
1535
+ "configuration": {
1536
+ "prompt": prompt.get_messages(),
1537
+ "n_samples_for_eval": (
1538
+ len(dataset_item_ids) if dataset_item_ids is not None else n_samples
1539
+ ),
1540
+ "total_dataset_items": total_items,
1246
1541
  },
1247
1542
  }
1248
1543
 
1249
- def llm_task(
1250
- dataset_item: Dict[str, Any]
1251
- ) -> Dict[str, str]:
1252
- try:
1253
- messages = [{
1254
- "role": item["role"],
1255
- "content": item["content"].format(**dataset_item)
1256
- } for item in prompt.formatted_messages]
1257
- except Exception as e:
1258
- logger.warning(f"Error in llm_task, this is usually a parsing error: {e}")
1259
- return {mappers.EVALUATED_LLM_TASK_OUTPUT: ""}
1260
-
1261
- model_output = self._call_model(
1262
- messages=messages,
1263
- is_reasoning=False
1264
- )
1265
-
1544
+ new_prompt = prompt.copy()
1545
+ new_prompt.set_messages(messages)
1546
+ try:
1547
+ agent = self.agent_class(new_prompt)
1548
+ except Exception:
1549
+ return 0.0
1550
+
1551
+ def llm_task(dataset_item: Dict[str, Any]) -> Dict[str, str]:
1552
+ # print("MESSAGES:", new_prompt.messages)
1553
+ messages = new_prompt.get_messages(dataset_item)
1554
+ model_output = agent.invoke(messages)
1555
+ # print("OUTPUT:", model_output)
1266
1556
  return {mappers.EVALUATED_LLM_TASK_OUTPUT: model_output}
1267
-
1557
+
1268
1558
  # Evaluate the prompt
1269
1559
  score = task_evaluator.evaluate(
1270
1560
  dataset=dataset,
@@ -1272,24 +1562,22 @@ Return only the new prompt list object.
1272
1562
  metric=metric,
1273
1563
  evaluated_task=llm_task,
1274
1564
  num_threads=self.num_threads,
1275
- project_name=self.project_name,
1565
+ project_name=experiment_config["project_name"],
1276
1566
  n_samples=n_samples if dataset_item_ids is None else None,
1277
- experiment_config=current_experiment_config,
1567
+ experiment_config=experiment_config,
1278
1568
  optimization_id=optimization_id,
1279
- verbose=verbose
1569
+ verbose=verbose,
1280
1570
  )
1281
1571
  return score
1282
1572
 
1283
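The evaluation path above applies the candidate messages to a copy of the prompt, builds an agent from it, and wraps the agent in an `llm_task` callable handed to `task_evaluator.evaluate`. A self-contained sketch of that shape follows; the agent and metric here are illustrative stubs (not the package's `agent_class`), and placeholder substitution is approximated with `str.format` where the package delegates to `prompt.get_messages(dataset_item)`:

```python
from typing import Any, Callable, Dict, List

OUTPUT_KEY = "llm_output"  # stand-in for mappers.EVALUATED_LLM_TASK_OUTPUT

class EchoAgent:
    """Illustrative agent stub: echoes the last message instead of calling an LLM."""
    def __init__(self, messages: List[Dict[str, str]]) -> None:
        self.messages = messages

    def invoke(self, messages: List[Dict[str, str]]) -> str:
        return messages[-1]["content"]

def exact_match(dataset_item: Dict[str, Any], llm_output: str) -> float:
    """Metric with the signature the optimizer expects: dataset_item and llm_output."""
    return 1.0 if llm_output.strip() == str(dataset_item.get("answer", "")).strip() else 0.0

def make_llm_task(
    candidate_messages: List[Dict[str, str]],
) -> Callable[[Dict[str, Any]], Dict[str, str]]:
    agent = EchoAgent(candidate_messages)

    def llm_task(dataset_item: Dict[str, Any]) -> Dict[str, str]:
        # Substitute dataset fields into the candidate messages before invoking.
        messages = [
            {"role": m["role"], "content": m["content"].format(**dataset_item)}
            for m in candidate_messages
        ]
        return {OUTPUT_KEY: agent.invoke(messages)}

    return llm_task

task = make_llm_task([{"role": "user", "content": "Answer briefly: {question}"}])
output = task({"question": "What is the capital of France?", "answer": "Paris"})
print(exact_match({"answer": "Paris"}, output[OUTPUT_KEY]))  # 0.0 for this echo stub
```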
- def _llm_deap_crossover(
1284
- self,
1285
- ind1: "creator.Individual",
1286
- ind2: "creator.Individual"
1287
- ) -> Tuple["creator.Individual", "creator.Individual"]:
1573
+ def _llm_deap_crossover(self, ind1: Any, ind2: Any) -> Tuple[Any, Any]:
1288
1574
  """Perform crossover by asking an LLM to blend two parent prompts."""
1289
- reporting.display_message(" Recombining prompts using an LLM.", verbose=self.verbose)
1575
+ reporting.display_message(
1576
+ " Recombining prompts using an LLM.", verbose=self.verbose
1577
+ )
1290
1578
 
1291
- parent1_messages: List[Dict[Literal["role", "content"], str]] = ind1
1292
- parent2_messages: List[Dict[Literal["role", "content"], str]] = ind2
1579
+ parent1_messages: List[Dict[str, str]] = ind1
1580
+ parent2_messages: List[Dict[str, str]] = ind2
1293
1581
  current_output_style_guidance = self.output_style_guidance
1294
1582
 
1295
1583
  user_prompt_for_llm_crossover = f"""Parent Prompt 1:
@@ -1308,38 +1596,50 @@ Follow the instructions provided in the system prompt regarding the JSON output
1308
1596
  ]
1309
1597
  """
1310
1598
  try:
1311
- logger.debug(f"Attempting LLM-driven crossover between: '{parent1_messages[:50]}...' and '{parent2_messages[:50]}...' aiming for style: '{current_output_style_guidance[:30]}...'")
1599
+ logger.debug(
1600
+ f"Attempting LLM-driven crossover between: '{parent1_messages[:50]}...' and '{parent2_messages[:50]}...' aiming for style: '{current_output_style_guidance[:30]}...'"
1601
+ )
1312
1602
  response_content = self._call_model(
1313
1603
  messages=[
1314
- {"role": "system", "content": self.get_llm_crossover_system_prompt()},
1604
+ {
1605
+ "role": "system",
1606
+ "content": self.get_llm_crossover_system_prompt(),
1607
+ },
1315
1608
  {"role": "user", "content": user_prompt_for_llm_crossover},
1316
1609
  ],
1317
- is_reasoning=True
1610
+ is_reasoning=True,
1318
1611
  )
1319
1612
  logger.debug(f"Raw LLM response for crossover: {response_content}")
1320
1613
 
1321
1614
  json_response = utils.json_to_dict(response_content)
1322
- if not isinstance(json_response, list) or len(json_response) != 2 or not all(isinstance(cs, list) for cs in json_response):
1323
- logger.warning("LLM Crossover: Malformed or empty children_prompts list. Falling back.")
1615
+ if (
1616
+ not isinstance(json_response, list)
1617
+ or len(json_response) != 2
1618
+ or not all(isinstance(cs, list) for cs in json_response)
1619
+ ):
1620
+ logger.warning(
1621
+ "LLM Crossover: Malformed or empty children_prompts list. Falling back."
1622
+ )
1324
1623
  raise ValueError("Malformed LLM crossover response")
1325
1624
 
1326
- child1: List[Dict[Literal["role", "content"], str]] = json_response[0]
1327
- child2: List[Dict[Literal["role", "content"], str]] = json_response[1]
1328
-
1329
- logger.debug(f"LLM Crossover generated child1: {json.dumps(child1)[:50]}... Child2: {json.dumps(child2)[:50]}...")
1330
- return creator.Individual(child1), creator.Individual(child2)
1625
+ child1: List[Dict[str, str]] = json_response[0]
1626
+ child2: List[Dict[str, str]] = json_response[1]
1627
+
1628
+ logger.debug(
1629
+ f"LLM Crossover generated child1: {json.dumps(child1)[:50]}... Child2: {json.dumps(child2)[:50]}..."
1630
+ )
1631
+ return creator.Individual(child1), creator.Individual(child2)
1331
1632
 
1332
1633
  except Exception as e:
1333
- logger.warning(f"LLM-driven crossover failed: {e}. Falling back to standard crossover.")
1634
+ logger.warning(
1635
+ f"LLM-driven crossover failed: {e}. Falling back to standard crossover."
1636
+ )
1334
1637
  return self._deap_crossover(ind1, ind2)
1335
1638
 
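The crossover path above expects the reasoning model to return a JSON list containing exactly two message lists and falls back to standard crossover otherwise. A small sketch of that shape check in isolation; the helper name is illustrative and `json.loads` stands in for the package's `utils.json_to_dict`:

```python
import json
from typing import Any, Dict, List, Tuple

def parse_crossover_children(raw: str) -> Tuple[List[Dict[str, str]], List[Dict[str, str]]]:
    """Validate the 'list of two prompt message lists' shape before using it."""
    parsed: Any = json.loads(raw)
    if (
        not isinstance(parsed, list)
        or len(parsed) != 2
        or not all(isinstance(child, list) for child in parsed)
    ):
        raise ValueError("Malformed LLM crossover response")
    return parsed[0], parsed[1]

raw = json.dumps([
    [{"role": "system", "content": "You are concise."}],
    [{"role": "system", "content": "You answer step by step."}],
])
child1, child2 = parse_crossover_children(raw)
```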
1336
- def _get_task_description_for_llm(
1337
- self,
1338
- prompt: chat_prompt.ChatPrompt
1339
- ) -> str:
1639
+ def _get_task_description_for_llm(self, prompt: chat_prompt.ChatPrompt) -> str:
1340
1640
  """Generates a concise task description for use in LLM prompts for fresh generation or radical innovation."""
1341
1641
  description = "Task: Given a list of AI messages with placeholder values, generate an effective prompt. "
1342
- description += f"The original high-level instruction being optimized is: '{prompt.formatted_messages}'. "
1642
+ description += f"The original high-level instruction being optimized is: '{prompt.get_messages()}'. "
1343
1643
  description += "The goal is to create an effective prompt that guides a language model to perform this task well."
1344
1644
  return description
1345
1645
 
@@ -1375,8 +1675,8 @@ Each prompt variation should aim to get the target LLM to produce answers matchi
1375
1675
  """
1376
1676
 
1377
1677
  def get_llm_crossover_system_prompt(self) -> str:
1378
- return f"""You are an expert prompt engineer specializing in creating novel prompts by intelligently blending existing ones.
1379
- Given two parent prompts, your task is to generate one or two new child prompts that effectively combine the strengths, styles, or core ideas of both parents.
1678
+ return f"""You are an expert prompt engineer specializing in creating novel prompts by intelligently blending existing ones.
1679
+ Given two parent prompts, your task is to generate one or two new child prompts that effectively combine the strengths, styles, or core ideas of both parents.
1380
1680
  The children should be coherent and aim to explore a potentially more effective region of the prompt design space, with a key goal of eliciting responses from the target language model in the following style: '{self.output_style_guidance}'.
1381
1681
 
1382
1682
  Consider the following when generating children:
@@ -1397,69 +1697,80 @@ Return a JSON object that is a list of both child prompts. Each child prompt is
1397
1697
  """
1398
1698
 
1399
1699
  def _get_radical_innovation_system_prompt(self) -> str:
1400
- return f"""You are an expert prompt engineer and a creative problem solver.
1401
- Given a task description and an existing prompt for that task (which might be underperforming), your goal is to generate a new, significantly improved, and potentially very different prompt.
1402
- Do not just make minor edits. Think about alternative approaches, structures, and phrasings that could lead to better performance.
1700
+ return f"""You are an expert prompt engineer and a creative problem solver.
1701
+ Given a task description and an existing prompt for that task (which might be underperforming), your goal is to generate a new, significantly improved, and potentially very different prompt.
1702
+ Do not just make minor edits. Think about alternative approaches, structures, and phrasings that could lead to better performance.
1403
1703
  Consider clarity, specificity, constraints, and how to best guide the language model for the described task TO PRODUCE OUTPUTS IN THE FOLLOWING STYLE: '{self.output_style_guidance}'.
1404
1704
  Return only the new prompt string, with no preamble or explanation.
1405
1705
  """
1406
1706
 
1407
1707
  def _infer_output_style_from_dataset(
1408
- self,
1409
- dataset: opik.Dataset,
1410
- prompt: chat_prompt.ChatPrompt,
1411
- n_examples: int = 5
1412
- ) -> Optional[str]:
1708
+ self, dataset: opik.Dataset, prompt: chat_prompt.ChatPrompt, n_examples: int = 5
1709
+ ) -> Optional[str]:
1413
1710
  """Analyzes dataset examples to infer the desired output style."""
1414
- with reporting.infer_output_style(verbose=self.verbose) as report_infer_output_style:
1415
- report_infer_output_style.start_style_inference(n_examples)
1416
-
1711
+ with reporting.infer_output_style(
1712
+ verbose=self.verbose
1713
+ ) as report_infer_output_style:
1714
+ report_infer_output_style.start_style_inference()
1715
+
1417
1716
  try:
1418
1717
  items_to_process = dataset.get_items(n_examples)
1419
1718
  except Exception as e:
1420
- report_infer_output_style.error(f"Failed to get items from dataset '{dataset.name}': {e}")
1719
+ report_infer_output_style.error(
1720
+ f"Failed to get items from dataset '{dataset.name}': {e}"
1721
+ )
1421
1722
  return None
1422
1723
 
1423
1724
  if not items_to_process:
1424
- report_infer_output_style.error(f"Dataset '{dataset.name}' is empty. Cannot infer output style.")
1725
+ report_infer_output_style.error(
1726
+ f"Dataset '{dataset.name}' is empty. Cannot infer output style."
1727
+ )
1425
1728
  return None
1426
1729
 
1427
1730
  # Need at least a couple of examples for meaningful inference
1428
1731
  if len(items_to_process) < min(n_examples, 2):
1429
- report_infer_output_style.error(f"Not enough dataset items (found {len(items_to_process)}) to reliably infer output style. Need at least {min(n_examples,2)}.")
1732
+ report_infer_output_style.error(
1733
+ f"Not enough dataset items (found {len(items_to_process)}) to reliably infer output style. Need at least {min(n_examples,2)}."
1734
+ )
1430
1735
  return None
1431
1736
 
1432
1737
  examples_str = ""
1433
1738
  for i, item_content in enumerate(items_to_process):
1434
1739
  filtered_content = {x: y for x, y in item_content.items() if x != "id"}
1435
- examples_str += f"Example {i+1}:\nDataset Item:\n{filtered_content}\n---\n"
1740
+ examples_str += (
1741
+ f"Example {i+1}:\nDataset Item:\n{filtered_content}\n---\n"
1742
+ )
1436
1743
 
1437
1744
  user_prompt_for_style_inference = f"""Please analyze the following examples from a dataset and provide a concise, actionable description of the REQUIRED output style for the target LLM. Before describing the output style, make sure to understand the dataset content and structure as it can include input, output and metadata fields. This description will be used to guide other LLMs in generating and refining prompts.
1438
1745
 
1439
1746
  {examples_str}
1440
1747
 
1441
- Based on these examples, what is the desired output style description?
1442
- Remember to focus on aspects like length, tone, structure, content details, and any recurring keywords or phrasing patterns in the outputs.
1748
+ Based on these examples, what is the desired output style description?
1749
+ Remember to focus on aspects like length, tone, structure, content details, and any recurring keywords or phrasing patterns in the outputs.
1443
1750
  The description should be a single string that can be directly used as an instruction for another LLM.
1444
1751
  Return ONLY this descriptive string.
1445
1752
  """
1446
- #report_infer_output_style.display_style_inference_prompt(user_prompt_for_style_inference)
1753
+ # report_infer_output_style.display_style_inference_prompt(user_prompt_for_style_inference)
1447
1754
 
1448
1755
  try:
1449
1756
  inferred_style = self._call_model(
1450
1757
  messages=[
1451
1758
  {"role": "system", "content": self._INFER_STYLE_SYSTEM_PROMPT},
1452
- {"role": "user", "content": user_prompt_for_style_inference}
1759
+ {"role": "user", "content": user_prompt_for_style_inference},
1453
1760
  ],
1454
- is_reasoning=True
1761
+ is_reasoning=True,
1455
1762
  )
1456
1763
  inferred_style = inferred_style.strip()
1457
1764
  if inferred_style:
1458
1765
  report_infer_output_style.success(inferred_style)
1459
1766
  return inferred_style
1460
1767
  else:
1461
- report_infer_output_style.error("LLM returned empty string for inferred output style.")
1768
+ report_infer_output_style.error(
1769
+ "LLM returned empty string for inferred output style."
1770
+ )
1462
1771
  return None
1463
1772
  except Exception as e:
1464
- report_infer_output_style.error(f"Error during output style inference: {e}")
1773
+ report_infer_output_style.error(
1774
+ f"Error during output style inference: {e}"
1775
+ )
1465
1776
  return None