opik-optimizer 0.7.3__py3-none-any.whl → 0.7.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1447 @@
1
+ from typing import Optional, Union, List, Dict, Any, Tuple
2
+ import opik
3
+ import logging
4
+ import random
5
+ import json
6
+ from string import Template
7
+ import os
8
+ import time
9
+ import Levenshtein
10
+ import numpy as np
11
+
12
+ from opik_optimizer.base_optimizer import BaseOptimizer, OptimizationRound
13
+ from opik_optimizer.optimization_config.configs import TaskConfig, MetricConfig
14
+ from opik_optimizer.optimization_result import OptimizationResult
15
+ from opik_optimizer import task_evaluator
16
+ from opik_optimizer.optimization_config import mappers
17
+ from opik.api_objects import opik_client
18
+ from opik.environment import get_tqdm_for_current_environment
19
+ from opik_optimizer import _throttle
20
+ import litellm
21
+ from litellm.caching import Cache
22
+ from opik.evaluation.models.litellm import opik_monitor as opik_litellm_monitor
23
+
24
+ # DEAP imports
25
+ from deap import base, creator, tools, algorithms
26
+
27
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Progress-bar callable returned by opik's environment helper.
tqdm = get_tqdm_for_current_environment()
# Rate limiter returned by the opik_optimizer throttle helper.
# NOTE(review): not referenced in the visible part of this file; presumably
# applied to LLM calls further down - confirm.
_rate_limiter = _throttle.get_rate_limiter_for_current_opik_installation()

# Using disk cache for LLM calls
# NOTE: both statements run at import time. The second one assigns the
# module-global `litellm.cache`, so merely importing this module changes
# litellm's cache configuration for the whole process.
disk_cache_dir = os.path.expanduser("~/.litellm_cache")
litellm.cache = Cache(type="disk", disk_cache_dir=disk_cache_dir)
34
+
35
+ class EvolutionaryOptimizer(BaseOptimizer):
36
+ """
37
+ Optimizes prompts using a genetic algorithm approach.
38
+ Focuses on evolving the prompt text itself.
39
+ Can operate in single-objective or multi-objective mode.
40
+ """
41
+
42
# --- Defaults for the __init__ keyword arguments of the same name ---
DEFAULT_POPULATION_SIZE = 30
DEFAULT_NUM_GENERATIONS = 15
DEFAULT_MUTATION_RATE = 0.2
DEFAULT_CROSSOVER_RATE = 0.8
DEFAULT_TOURNAMENT_SIZE = 4
DEFAULT_NUM_THREADS = 12
# NOTE(review): not referenced in the visible part of this file - confirm usage.
DEFAULT_HALL_OF_FAME_SIZE = 10
DEFAULT_ELITISM_SIZE = 3
# Lower / upper clamp applied by _get_adaptive_mutation_rate().
DEFAULT_MIN_MUTATION_RATE = 0.1
DEFAULT_MAX_MUTATION_RATE = 0.4
DEFAULT_ADAPTIVE_MUTATION = True
# Population diversity below this value is treated as "low diversity" by
# _get_adaptive_mutation_rate() and _deap_mutation().
DEFAULT_DIVERSITY_THRESHOLD = 0.7
# A relative improvement below this value counts as a stagnating generation
# in _get_adaptive_mutation_rate().
DEFAULT_RESTART_THRESHOLD = 0.01
# Number of consecutive stagnating generations after which
# _get_adaptive_mutation_rate() applies its largest mutation-rate boost.
DEFAULT_RESTART_GENERATIONS = 3
# NOTE(review): the next two are not referenced in the visible part of this
# file - confirm usage.
DEFAULT_CACHE_SIZE = 1000
DEFAULT_EARLY_STOPPING_GENERATIONS = 5
DEFAULT_ENABLE_MOO = True
DEFAULT_ENABLE_LLM_CROSSOVER = True
DEFAULT_SEED = 42
# Used by __init__ when no output_style_guidance is supplied.
DEFAULT_OUTPUT_STYLE_GUIDANCE = "Produce clear, effective, and high-quality responses suitable for the task."
# Passed as `weights` when __init__ creates the DEAP "FitnessMulti" class.
DEFAULT_MOO_WEIGHTS = (1.0, -1.0) # (Maximize Score, Minimize Length)

# System prompt asking an LLM to summarise the output style of dataset examples.
# NOTE(review): presumably consumed by _infer_output_style_from_dataset(), which
# is outside the visible part of this file - confirm.
_INFER_STYLE_SYSTEM_PROMPT = """You are an expert in linguistic analysis and prompt engineering. Your task is to analyze a few input-output examples from a dataset and provide a concise, actionable description of the desired output style. This description will be used to guide other LLMs in generating and refining prompts.

Focus on characteristics like:
- **Length**: (e.g., single word, short phrase, one sentence, multiple sentences, a paragraph)
- **Tone**: (e.g., factual, formal, informal, conversational, academic)
- **Structure**: (e.g., direct answer first, explanation then answer, list, yes/no then explanation)
- **Content Details**: (e.g., includes only the answer, includes reasoning, provides examples, avoids pleasantries)
- **Keywords/Phrasing**: Any recurring keywords or phrasing patterns in the outputs.

Provide a single string that summarizes this style. This summary should be directly usable as an instruction for another LLM.
For example: 'Outputs should be a single, concise proper noun.' OR 'Outputs should be a short paragraph explaining the reasoning, followed by a direct answer, avoiding conversational pleasantries.' OR 'Outputs are typically 1-2 sentences, providing a direct factual answer.'
Return ONLY this descriptive string, with no preamble or extra formatting.
"""
77
+
78
def __init__(
    self,
    model: str,
    project_name: Optional[str] = None,
    population_size: int = DEFAULT_POPULATION_SIZE,
    num_generations: int = DEFAULT_NUM_GENERATIONS,
    mutation_rate: float = DEFAULT_MUTATION_RATE,
    crossover_rate: float = DEFAULT_CROSSOVER_RATE,
    tournament_size: int = DEFAULT_TOURNAMENT_SIZE,
    num_threads: int = DEFAULT_NUM_THREADS,
    elitism_size: int = DEFAULT_ELITISM_SIZE,
    adaptive_mutation: bool = DEFAULT_ADAPTIVE_MUTATION,
    enable_moo: bool = DEFAULT_ENABLE_MOO,
    enable_llm_crossover: bool = DEFAULT_ENABLE_LLM_CROSSOVER,
    seed: Optional[int] = DEFAULT_SEED,
    output_style_guidance: Optional[str] = None,
    infer_output_style: bool = False,
    verbose: int = 1,
    **model_kwargs,
):
    """Configure the optimizer and register the DEAP operators it uses.

    Args:
        model: Forwarded to ``BaseOptimizer.__init__``.
        project_name: Forwarded to ``BaseOptimizer.__init__``.
        population_size: Stored on the instance; also bounds the size of the
            initial population built by ``_initialize_population``.
        num_generations: Stored on the instance.
        mutation_rate: Base rate returned or scaled by
            ``_get_adaptive_mutation_rate``.
        crossover_rate: Stored on the instance.
        tournament_size: ``tournsize`` for ``tools.selTournament``; only used
            when ``enable_moo`` is false.
        num_threads: Stored on the instance.
        elitism_size: Stored on the instance.
        adaptive_mutation: When false, ``_get_adaptive_mutation_rate`` always
            returns ``mutation_rate`` unchanged.
        enable_moo: Selects the ``FitnessMulti`` fitness class and NSGA-II
            selection instead of ``FitnessMax`` and tournament selection.
        enable_llm_crossover: Registers ``_llm_deap_crossover`` as the
            ``mate`` operator instead of ``_deap_crossover``.
        seed: When not ``None``, seeds both ``random`` and ``numpy.random``.
        output_style_guidance: Style description embedded in the LLM prompts;
            ``DEFAULT_OUTPUT_STYLE_GUIDANCE`` is used when ``None``.
        infer_output_style: Stored on the instance.
        verbose: Currently ignored - see the FIXME below.
        **model_kwargs: Forwarded to ``BaseOptimizer.__init__``.
    """
    # FIXME: Hack for verbose till its merged
    # (the `verbose` argument is accepted but deliberately not applied yet).
    self.verbose = 1

    # Initialize base class first
    super().__init__(model=model, project_name=project_name, **model_kwargs)
    self.population_size = population_size
    self.num_generations = num_generations
    self.mutation_rate = mutation_rate
    self.crossover_rate = crossover_rate
    self.tournament_size = tournament_size
    self.num_threads = num_threads
    self.elitism_size = elitism_size
    self.adaptive_mutation = adaptive_mutation
    self.enable_moo = enable_moo
    self.enable_llm_crossover = enable_llm_crossover
    self.seed = seed
    self.output_style_guidance = output_style_guidance if output_style_guidance is not None else self.DEFAULT_OUTPUT_STYLE_GUIDANCE
    self.infer_output_style = infer_output_style
    self.llm_call_counter = 0
    self._opik_client = opik_client.get_client_cached()
    self._current_optimization_id = None
    self._current_generation = 0
    self._best_fitness_history = []
    self._generations_without_improvement = 0
    self._llm_cache = {}
    self._current_population = []
    self._generations_without_overall_improvement = 0

    if self.seed is not None:
        random.seed(self.seed)
        np.random.seed(self.seed)
        logger.info(f"Global random seed set to: {self.seed}")
        # Note: DEAP tools generally respect random.seed().
        # TODO investigate if specific DEAP components require separate seeding

    # DEAP's `creator` is a process-wide registry, so the classes are only
    # created when they do not exist yet.
    if self.enable_moo:
        if not hasattr(creator, "FitnessMulti"):
            creator.create("FitnessMulti", base.Fitness, weights=self.DEFAULT_MOO_WEIGHTS)
        fitness_attr = creator.FitnessMulti
    else:
        if not hasattr(creator, "FitnessMax"):
            creator.create("FitnessMax", base.Fitness, weights=(1.0,))
        fitness_attr = creator.FitnessMax

    # (Re)create `Individual` when it is missing or bound to another fitness
    # class. The getattr default matters: DEAP documents class-valued
    # `creator.create` keyword arguments as being instantiated per instance,
    # so the `Individual` class itself may not expose a `fitness` attribute.
    # Without a default that lookup raises AttributeError as soon as a second
    # optimizer is constructed in the same process; with it, the class is
    # simply rebuilt.
    existing_individual = getattr(creator, "Individual", None)
    if existing_individual is None or getattr(existing_individual, "fitness", None) != fitness_attr:
        if existing_individual is not None:
            del creator.Individual
        creator.create("Individual", str, fitness=fitness_attr)

    self.toolbox = base.Toolbox()
    self.toolbox.register("default_individual", lambda: creator.Individual("placeholder"))
    self.toolbox.register("population", tools.initRepeat, list, self.toolbox.default_individual)

    if self.enable_llm_crossover:
        self.toolbox.register("mate", self._llm_deap_crossover)
    else:
        self.toolbox.register("mate", self._deap_crossover)

    self.toolbox.register("mutate", self._deap_mutation)

    if self.enable_moo:
        self.toolbox.register("select", tools.selNSGA2)
    else:
        self.toolbox.register("select", tools.selTournament, tournsize=self.tournament_size)

    logger.debug(
        f"Initialized EvolutionaryOptimizer with model: {model}, MOO_enabled: {self.enable_moo}, "
        f"LLM_Crossover: {self.enable_llm_crossover}, Seed: {self.seed}, "
        f"OutputStyleGuidance: '{self.output_style_guidance[:50]}...', "
        f"population_size: {self.population_size}, num_generations: {self.num_generations}, "
        f"mutation_rate: {self.mutation_rate}, crossover_rate: {self.crossover_rate}"
    )
170
+
171
+ def _get_adaptive_mutation_rate(self) -> float:
172
+ """Calculate adaptive mutation rate based on population diversity and progress."""
173
+ if not self.adaptive_mutation or len(self._best_fitness_history) < 2:
174
+ return self.mutation_rate
175
+
176
+ # Calculate improvement rate
177
+ recent_improvement = (self._best_fitness_history[-1] - self._best_fitness_history[-2]) / abs(self._best_fitness_history[-2])
178
+
179
+ # Calculate population diversity
180
+ current_diversity = self._calculate_population_diversity()
181
+
182
+ # Check for stagnation
183
+ if recent_improvement < self.DEFAULT_RESTART_THRESHOLD:
184
+ self._generations_without_improvement += 1
185
+ else:
186
+ self._generations_without_improvement = 0
187
+
188
+ # Adjust mutation rate based on both improvement and diversity
189
+ if self._generations_without_improvement >= self.DEFAULT_RESTART_GENERATIONS:
190
+ # Significant stagnation - increase mutation significantly
191
+ return min(self.mutation_rate * 2.5, self.DEFAULT_MAX_MUTATION_RATE)
192
+ elif recent_improvement < 0.01 and current_diversity < self.DEFAULT_DIVERSITY_THRESHOLD:
193
+ # Both stagnating and low diversity - increase mutation significantly
194
+ return min(self.mutation_rate * 2.0, self.DEFAULT_MAX_MUTATION_RATE)
195
+ elif recent_improvement < 0.01:
196
+ # Stagnating but good diversity - moderate increase
197
+ return min(self.mutation_rate * 1.5, self.DEFAULT_MAX_MUTATION_RATE)
198
+ elif recent_improvement > 0.05:
199
+ # Good progress - decrease mutation
200
+ return max(self.mutation_rate * 0.8, self.DEFAULT_MIN_MUTATION_RATE)
201
+ return self.mutation_rate
202
+
203
+ def _calculate_population_diversity(self) -> float:
204
+ """Calculate the diversity of the current population."""
205
+ if not hasattr(self, '_current_population') or not self._current_population:
206
+ return 0.0
207
+
208
+ # Calculate average Levenshtein distance between all pairs
209
+ total_distance = 0
210
+ count = 0
211
+ for i in range(len(self._current_population)):
212
+ for j in range(i + 1, len(self._current_population)):
213
+ str1 = str(self._current_population[i])
214
+ str2 = str(self._current_population[j])
215
+ distance = Levenshtein.distance(str1, str2)
216
+ max_len = max(len(str1), len(str2))
217
+ if max_len > 0:
218
+ normalized_distance = distance / max_len
219
+ total_distance += normalized_distance
220
+ count += 1
221
+
222
+ return total_distance / count if count > 0 else 0.0
223
+
224
+ def _deap_crossover(
225
+ self,
226
+ ind1: "creator.Individual",
227
+ ind2: "creator.Individual"
228
+ ) -> Tuple["creator.Individual", "creator.Individual"]:
229
+ """Enhanced crossover operation that preserves semantic meaning.
230
+ Attempts chunk-level crossover first, then falls back to word-level.
231
+ """
232
+ str1_orig, str2_orig = str(ind1), str(ind2)
233
+
234
+ chunks1 = [chunk.strip() for chunk in str1_orig.split('.') if chunk.strip()]
235
+ chunks2 = [chunk.strip() for chunk in str2_orig.split('.') if chunk.strip()]
236
+
237
+ # Try chunk-level crossover if both parents have at least 2 chunks
238
+ if len(chunks1) >= 2 and len(chunks2) >= 2:
239
+ min_num_chunks = min(len(chunks1), len(chunks2))
240
+ # Crossover point is between 1 and min_num_chunks - 1
241
+ # This requires min_num_chunks >= 2, which is already checked.
242
+ point = random.randint(1, min_num_chunks - 1)
243
+
244
+ child1_chunks = chunks1[:point] + chunks2[point:]
245
+ child2_chunks = chunks2[:point] + chunks1[point:]
246
+
247
+ child1_str = '. '.join(child1_chunks) + ('.' if child1_chunks else '')
248
+ child2_str = '. '.join(child2_chunks) + ('.' if child2_chunks else '')
249
+
250
+ return creator.Individual(child1_str), creator.Individual(child2_str)
251
+
252
+ # Fallback to word-level crossover if chunk-level is not suitable
253
+ words1 = str1_orig.split()
254
+ words2 = str2_orig.split()
255
+
256
+ # If either prompt is empty (no words), return parents
257
+ if not words1 or not words2:
258
+ return ind1, ind2
259
+
260
+ min_word_len = min(len(words1), len(words2))
261
+ # Need at least 2 words in the shorter prompt for a valid crossover point
262
+ if min_word_len < 2:
263
+ return ind1, ind2
264
+
265
+ # Crossover point for words: 1 to min_word_len - 1
266
+ point = random.randint(1, min_word_len - 1)
267
+ child1_words = words1[:point] + words2[point:]
268
+ child2_words = words2[:point] + words1[point:]
269
+
270
+ return creator.Individual(' '.join(child1_words)), creator.Individual(' '.join(child2_words))
271
+
272
def _deap_mutation(
    self,
    individual: "creator.Individual",
    task_config: TaskConfig
) -> Tuple["creator.Individual",]:
    """Mutate one individual with a randomly chosen strategy.

    A single uniform draw picks between semantic, structural and word-level
    mutation. When population diversity is below
    ``DEFAULT_DIVERSITY_THRESHOLD``, the semantic share grows from 40% to
    50% and the word-level share shrinks from 30% to 20%.
    ``task_config`` is only forwarded to the semantic mutation.
    """
    text = str(individual)

    low_diversity = self._calculate_population_diversity() < self.DEFAULT_DIVERSITY_THRESHOLD
    # Upper bounds of the semantic and structural bands of the draw.
    semantic_cutoff = 0.5 if low_diversity else 0.4
    structural_cutoff = 0.8 if low_diversity else 0.7

    draw = random.random()
    if draw <= semantic_cutoff:
        return self._semantic_mutation(text, task_config)
    if draw <= structural_cutoff:
        return self._structural_mutation(text)
    return self._word_level_mutation(text)
304
+
305
def _semantic_mutation(
    self,
    prompt: str,
    task_config: TaskConfig
) -> Tuple["creator.Individual",]:
    """Ask the LLM to rewrite ``prompt`` using a randomly chosen strategy.

    One call in ten is delegated to ``_radical_innovation_mutation``. Any
    exception raised while building or sending the request is logged and the
    original prompt is returned as a new individual.
    """
    style = self.output_style_guidance
    if random.random() < 0.1:
        return self._radical_innovation_mutation(prompt, task_config)

    try:
        strategy = random.choice([
            "rephrase", "simplify", "elaborate", "restructure", "focus", "increase_complexity_and_detail"
        ])

        # Modification instruction for each strategy name.
        instructions = {
            "rephrase": f"Create a different way to express the same instruction, possibly with a different length or structure, ensuring it still aims for an answer from the target LLM in the style of: '{style}'.",
            "simplify": f"Simplify the instruction while maintaining its core meaning, potentially making it more concise, to elicit an answer in the style of: '{style}'.",
            "elaborate": f"Add more relevant detail and specificity to the instruction, potentially increasing its length, but only if it helps achieve a more accurate answer from the target LLM in the style of: '{style}'.",
            "restructure": f"Change the structure of the instruction (e.g., reorder sentences, combine/split ideas) while keeping its intent, ensuring the new structure strongly guides towards an output in the style of: '{style}'.",
            "focus": f"Emphasize the key aspects of the instruction, perhaps by rephrasing or adding clarifying statements, to better elicit an answer in the style of: '{style}'.",
            "increase_complexity_and_detail": f"Significantly elaborate on this instruction. Add more details, examples, context, or constraints to make it more comprehensive. The goal of this elaboration is to make the prompt itself more detailed, so that it VERY CLEARLY guides the target LLM to produce a highly accurate final answer in the style of: '{style}'. The prompt can be long if needed to achieve this output style."
        }

        task_description = self._get_task_description_for_llm(task_config)
        request = (
            f"Given this prompt: '{prompt}'\n"
            f"Task context: {task_description}\n"
            f"Desired output style from target LLM: '{style}'\n"
            f"Instruction for this modification: {instructions[strategy]}.\n"
            "Return only the modified prompt string, nothing else.\n"
        )
        rewritten = self._call_model(
            prompt=request,
            system_prompt=f"You are a prompt engineering expert. Your goal is to modify prompts to improve their effectiveness in eliciting specific types of answers, particularly matching the style: '{style}'. Follow the specific modification instruction provided.",
            is_reasoning=True
        )
        return creator.Individual(rewritten.strip()),
    except Exception as e:
        # Best effort: a failed LLM call must not abort the evolution loop.
        logger.warning(f"Error in semantic mutation for prompt '{prompt[:50]}...': {e}")
        return creator.Individual(prompt),
344
+
345
+ def _structural_mutation(
346
+ self,
347
+ prompt: str
348
+ ) -> Tuple["creator.Individual",]:
349
+ """Perform structural mutation (reordering, combining, splitting)."""
350
+ sentences = [s.strip() for s in prompt.split('.') if s.strip()]
351
+ if len(sentences) <= 1:
352
+ return self._word_level_mutation(prompt)
353
+
354
+ mutation_type = random.random()
355
+ if mutation_type < 0.3:
356
+ # Reorder sentences
357
+ random.shuffle(sentences)
358
+ return creator.Individual('. '.join(sentences) + '.'),
359
+ elif mutation_type < 0.6:
360
+ # Combine adjacent sentences
361
+ if len(sentences) >= 2:
362
+ idx = random.randint(0, len(sentences) - 2)
363
+ combined = sentences[idx] + ' and ' + sentences[idx + 1]
364
+ sentences[idx:idx+2] = [combined]
365
+ return creator.Individual('. '.join(sentences) + '.'),
366
+ else:
367
+ # Split a sentence
368
+ idx = random.randint(0, len(sentences) - 1)
369
+ words = sentences[idx].split()
370
+ if len(words) > 3:
371
+ split_point = random.randint(2, len(words) - 2)
372
+ sentences[idx:idx+1] = [' '.join(words[:split_point]), ' '.join(words[split_point:])]
373
+ return creator.Individual('. '.join(sentences) + '.'),
374
+
375
+ return creator.Individual(prompt),
376
+
377
def _word_level_mutation(self, prompt: str) -> Tuple["creator.Individual",]:
    """Mutate a single word of ``prompt``.

    One of three edits is chosen at random: replace a word with a synonym,
    swap two words (only when there are more than two), or rephrase a word.
    Prompts with at most one word are returned unchanged.
    """
    tokens = prompt.split()
    if len(tokens) < 2:
        return creator.Individual(prompt),

    draw = random.random()
    if draw >= 0.6:
        # Phrase modification
        position = random.randint(0, len(tokens) - 1)
        tokens[position] = self._modify_phrase(tokens[position])
    elif draw >= 0.3:
        # Word reordering
        if len(tokens) > 2:
            left, right = random.sample(range(len(tokens)), 2)
            tokens[left], tokens[right] = tokens[right], tokens[left]
    else:
        # Word replacement
        position = random.randint(0, len(tokens) - 1)
        tokens[position] = self._get_synonym(tokens[position])

    return creator.Individual(' '.join(tokens)),
399
+
400
+ def _get_synonym(
401
+ self,
402
+ word: str
403
+ ) -> str:
404
+ """Get a synonym for a word using LLM."""
405
+ try:
406
+ response = self._call_model(
407
+ prompt=f"Give me a single synonym for the word '{word}'. Return only the synonym, nothing else.",
408
+ system_prompt="You are a helpful assistant that provides synonyms. Return only the synonym word, no explanation or additional text.",
409
+ is_reasoning=True
410
+ )
411
+ return response.strip()
412
+ except Exception as e:
413
+ logger.warning(f"Error getting synonym for '{word}': {e}")
414
+ return word
415
+
416
+ def _modify_phrase(
417
+ self,
418
+ phrase: str
419
+ ) -> str:
420
+ """Modify a phrase while preserving meaning using LLM."""
421
+ try:
422
+ response = self._call_model(
423
+ prompt=f"Modify this phrase while keeping the same meaning: '{phrase}'. Return only the modified phrase, nothing else.",
424
+ system_prompt="You are a helpful assistant that rephrases text. Return only the modified phrase, no explanation or additional text.",
425
+ is_reasoning=True
426
+ )
427
+ return response.strip()
428
+ except Exception as e:
429
+ logger.warning(f"Error modifying phrase '{phrase}': {e}")
430
+ return phrase
431
+
432
def _radical_innovation_mutation(
    self,
    prompt_str: str,
    task_config: TaskConfig
) -> Tuple["creator.Individual",]:
    """Ask the LLM for a substantially different prompt for the same task.

    The request carries the task description, the desired output style and
    the existing prompt. If the model call fails, the failure is logged and
    the original prompt is returned as a new individual.
    """
    logger.debug(f"Attempting radical innovation for prompt: {prompt_str[:70]}...")
    task_description = self._get_task_description_for_llm(task_config)
    style = self.output_style_guidance

    request = (
        "Task Context:\n"
        f"{task_description}\n"
        f"Desired output style from target LLM: '{style}'\n"
        "\n"
        "Existing Prompt (which may be underperforming):\n"
        f"'''{prompt_str}'''\n"
        "\n"
        "Please generate a new, significantly improved, and potentially very different prompt for this task.\n"
        "Focus on alternative approaches, better clarity, or more effective guidance for the language model, aiming for the desired output style.\n"
        "Return only the new prompt string.\n"
    )
    try:
        replacement = self._call_model(
            prompt=request,
            system_prompt=self.get_radical_innovation_system_prompt(),
            is_reasoning=True
        )
        logger.info(f"Radical innovation generated: {replacement[:70]}... from: {prompt_str[:70]}...")
        return creator.Individual(replacement.strip()),
    except Exception as e:
        # Best effort: fall back to the unmodified prompt.
        logger.warning(f"Radical innovation mutation failed for prompt '{prompt_str[:50]}...': {e}. Returning original.")
        return creator.Individual(prompt_str),
464
+
465
def _initialize_population(
    self,
    initial_prompt: str,
    task_config: TaskConfig,
) -> List[str]:
    """Build the starting population of prompt strings.

    The population always starts with ``initial_prompt``. Of the remaining
    ``population_size - 1`` slots, about 20% (at least one) are "fresh
    start" prompts generated from the task description alone; the rest are
    LLM-generated variations of ``initial_prompt``. Whenever an LLM call
    fails or returns unusable output, ``_generate_fallback_variations``
    supplies the prompts for that portion instead. Duplicates are removed
    and any shortfall is topped up with ``#fallback`` variants.

    Args:
        initial_prompt: The prompt the optimization starts from.
        task_config: Used to obtain the task description sent to the LLM.

    Returns:
        At most ``population_size`` unique prompt strings, with
        ``initial_prompt`` first.
    """
    population = [initial_prompt]
    if self.population_size <= 1:
        return population

    num_to_generate_total = self.population_size - 1
    # Always >= 1, so the fresh-start section below always runs.
    num_fresh_starts = max(1, int(num_to_generate_total * 0.2))
    num_variations_on_initial = num_to_generate_total - num_fresh_starts

    task_desc_for_llm = self._get_task_description_for_llm(task_config)
    current_output_style_guidance = self.output_style_guidance

    # --- "Fresh start" prompts, generated from the task description only ---
    logger.info(f"Generating {num_fresh_starts} 'fresh start' prompts based on task description (aiming for style: '{current_output_style_guidance[:30]}...')...")
    fresh_start_user_prompt = f"""Here is a description of a task:
{task_desc_for_llm}

The goal is to generate prompts that will make a target LLM produce responses in the following style: '{current_output_style_guidance}'.

Please generate {num_fresh_starts} diverse and effective prompt(s) for a language model to accomplish this task, ensuring they guide towards this specific output style.
Focus on clarity, completeness, and guiding the model effectively towards the desired style. Explore different structural approaches.
Your response MUST be a valid JSON list of strings. Do NOT include any other text, explanations, or Markdown formatting like ```json ... ``` around the list.
Example of valid response: ["Prompt targeting specified style.", "Another prompt designed for the output style."]
"""
    fresh_prompts = []
    try:
        response_content = self._call_model(
            prompt=fresh_start_user_prompt,
            system_prompt=f"You are an expert prompt engineer. Your task is to generate novel, effective prompts from scratch based on a task description, specifically aiming for prompts that elicit answers in the style: '{current_output_style_guidance}'. Output ONLY a raw JSON list of strings.",
            is_reasoning=True
        )
        logger.debug(f"Raw LLM response for fresh start prompts: {response_content}")
        cleaned_response_content = self._strip_markdown_code_fence(response_content)
        try:
            parsed_fresh = json.loads(cleaned_response_content)
        except json.JSONDecodeError as e_json:
            logger.warning(f"JSONDecodeError generating fresh start prompts: {e_json}. LLM response (after cleaning): '{cleaned_response_content}'. Using fallbacks for fresh starts.")
        else:
            if isinstance(parsed_fresh, list) and parsed_fresh and all(isinstance(p, str) for p in parsed_fresh):
                fresh_prompts = parsed_fresh[:num_fresh_starts]
            else:
                logger.warning(f"LLM response for fresh starts was not a valid list of strings or was empty: {cleaned_response_content}. Using fallbacks for fresh starts.")
    except Exception as e:
        # Best effort: any failure of the LLM call falls through to the fallbacks.
        logger.warning(f"Error generating fresh start prompts: {e}. Using fallbacks for fresh starts.")

    if fresh_prompts:
        population.extend(fresh_prompts)
        logger.info(f"Generated {len(fresh_prompts)} fresh prompts from LLM.")
    else:
        population.extend(self._generate_fallback_variations(f"Fresh start targeting style: {current_output_style_guidance[:20]}", num_fresh_starts))

    # --- Variations on the initial prompt for the remaining slots ---
    # TODO: Could add variations with hyper-parameters from the task config like temperature, etc.
    if num_variations_on_initial > 0:
        logger.info(f"Generating {num_variations_on_initial} variations of the initial prompt (aiming for style: '{current_output_style_guidance[:30]}...')...")
        user_prompt_for_variation = f"""Initial prompt:
'''{initial_prompt}'''

Task context:
{task_desc_for_llm}
Desired output style from target LLM: '{current_output_style_guidance}'

Generate {num_variations_on_initial} diverse alternative prompts based on the initial prompt above, keeping the task context and desired output style in mind.
All generated prompt variations should strongly aim to elicit answers from the target LLM matching the style: '{current_output_style_guidance}'.
For each variation, consider how to best achieve this style, e.g., by adjusting specificity, structure, phrasing, constraints, or by explicitly requesting it.

Return a JSON array of prompts with the following structure:
{{
"prompts": [
{{
"prompt": "alternative prompt 1 designed for the specified output style",
"strategy": "brief description of the variation strategy used, e.g., 'direct instruction for target style'"
}}
// ... more prompts if num_variations_on_initial > 1
]
}}
Ensure a good mix of variations, all targeting the specified output style from the end LLM.
"""
        generated_prompts_variations = []
        try:
            response_content_variations = self._call_model(
                prompt=user_prompt_for_variation,
                system_prompt=self.get_reasoning_system_prompt_for_variation(),
                is_reasoning=True
            )
            logger.debug(f"Raw response for population variations: {response_content_variations}")
            # Strip Markdown fences here too, exactly as for the fresh-start
            # response; a fenced reply would otherwise fail to parse and
            # discard every generated variation.
            json_response_variations = json.loads(
                self._strip_markdown_code_fence(response_content_variations)
            )
            # Accept the requested {"prompts": [...]} object as well as a bare
            # list of entries.
            if isinstance(json_response_variations, dict):
                entries = json_response_variations.get("prompts", [])
            else:
                entries = json_response_variations
            if isinstance(entries, list):
                generated_prompts_variations = [
                    entry["prompt"]
                    for entry in entries
                    if isinstance(entry, dict) and isinstance(entry.get("prompt"), str)
                ]
            if not generated_prompts_variations:
                logger.warning("Could not parse 'prompts' list for variations. Using fallback for remaining.")
        except Exception as e:
            logger.error(f"Error calling LLM for initial population variations: {e}. Using fallback for remaining.")

        if generated_prompts_variations:
            kept_variations = generated_prompts_variations[:num_variations_on_initial]
            population.extend(kept_variations)
            logger.info(f"Successfully parsed {len(kept_variations)} variations from LLM response.")
        else:
            population.extend(self._generate_fallback_variations(initial_prompt, num_variations_on_initial))

    # Ensure population is of the required size using unique prompts
    # TODO Test with levenshtein distance
    final_population_list = list(dict.fromkeys(population))  # order-preserving de-duplication
    final_population_set = set(final_population_list)

    # If not enough unique prompts, fill with fallbacks (could be more sophisticated)
    while len(final_population_list) < self.population_size:
        fallback_prompt = initial_prompt + f" #fallback{len(final_population_list)}"
        if fallback_prompt in final_population_set:
            # Safeguard if initial_prompt itself is causing issues with uniqueness
            fallback_prompt = f"Fallback prompt variation {random.randint(1000,9999)}"
            if fallback_prompt in final_population_set:
                # Avoid infinite loop in extreme edge case
                break
        final_population_list.append(fallback_prompt)
        final_population_set.add(fallback_prompt)

    logger.info(f"Initialized population with {len(final_population_list)} prompts.")
    # Return exactly population_size prompts if possible, or fewer if generation failed badly.
    return final_population_list[:self.population_size]

def _strip_markdown_code_fence(self, text: str) -> str:
    """Remove a surrounding ```json ... ``` or ``` ... ``` fence from ``text``."""
    cleaned = text.strip()
    for opening in ("```json", "```"):
        if cleaned.startswith(opening):
            cleaned = cleaned[len(opening):]
            if cleaned.endswith("```"):
                cleaned = cleaned[:-3]
            break
    return cleaned.strip()
605
+
606
+ def _generate_diverse_variation(
607
+ self,
608
+ base_prompt: str,
609
+ seen_prompts: set
610
+ ) -> str:
611
+ """Generate a new variation that's different from existing ones."""
612
+ max_attempts = 5
613
+ for _ in range(max_attempts):
614
+ # Try different mutation strategies
615
+ mutation_choice = random.random()
616
+ if mutation_choice < 0.3:
617
+ new_prompt = self._semantic_mutation(base_prompt)[0]
618
+ elif mutation_choice < 0.6:
619
+ new_prompt = self._structural_mutation(base_prompt)[0]
620
+ else:
621
+ new_prompt = self._word_level_mutation(base_prompt)[0]
622
+
623
+ # Check if this variation is sufficiently different
624
+ is_diverse = True
625
+ for existing in seen_prompts:
626
+ if Levenshtein.distance(str(new_prompt), existing) / max(len(str(new_prompt)), len(existing)) < 0.3:
627
+ is_diverse = False
628
+ break
629
+ if is_diverse:
630
+ return str(new_prompt)
631
+
632
+ # If we couldn't generate a diverse variation, create a simple one
633
+ return base_prompt + f" #v{len(seen_prompts)}"
634
+
635
+ def _generate_fallback_variations(
636
+ self,
637
+ initial_prompt: str,
638
+ num_variations: int
639
+ ) -> List[str]:
640
+ """Generate fallback variations when LLM generation fails."""
641
+ variations = []
642
+ words = initial_prompt.split()
643
+
644
+ for i in range(num_variations):
645
+ if len(words) > 3:
646
+ # Shuffle words
647
+ shuffled = words.copy()
648
+ random.shuffle(shuffled)
649
+ variations.append(' '.join(shuffled))
650
+ else:
651
+ # Add simple variations
652
+ variations.append(initial_prompt + f" #v{i}")
653
+
654
+ return variations
655
+
656
def optimize_prompt(
    self,
    dataset: Union[str, opik.Dataset],
    metric_config: MetricConfig,
    task_config: TaskConfig,
    experiment_config: Optional[Dict] = None,
    n_samples: Optional[int] = None,
    auto_continue: bool = False,
    **kwargs,
) -> OptimizationResult:
    """Evolve ``task_config.instruction_prompt`` with a DEAP genetic algorithm.

    The run resets the optimizer's per-run state, optionally infers an output
    style from the dataset, builds and evaluates an initial population, and
    then iterates selection, crossover, mutation and evaluation for up to
    ``self.num_generations`` generations. The population is re-seeded when the
    primary objective stagnates, and the loop stops early when the overall
    best score has not improved for ``DEFAULT_EARLY_STOPPING_GENERATIONS``
    generations.

    Args:
        dataset: Opik dataset, or the name of one to fetch via the Opik client.
        metric_config: Metric used as the primary objective.
        task_config: Task definition; its ``instruction_prompt`` is the seed.
        experiment_config: Extra experiment metadata, copied for each evaluation.
        n_samples: Number of dataset items used per evaluation.
        auto_continue: Accepted for interface compatibility; not read here.
        **kwargs: Accepted for interface compatibility; not read here.

    Returns:
        An ``OptimizationResult`` holding the best prompt, its primary score,
        the run details and the per-generation history.
    """
    self.llm_call_counter = 0
    self._history = []
    self._current_optimization_id = None
    self._current_generation = 0
    self._best_fitness_history = []
    self._generations_without_improvement = 0
    self._llm_cache.clear()
    self._current_population = []
    self._generations_without_overall_improvement = 0

    # Remember what was configured before this run possibly overrides it, so
    # the result details can report the two values separately.
    user_output_style_guidance = self.output_style_guidance

    # Determine final output_style_guidance
    if self.infer_output_style and \
        (self.output_style_guidance is None or self.output_style_guidance == self.DEFAULT_OUTPUT_STYLE_GUIDANCE):
        # If user wants inference AND hasn't provided a specific custom guidance
        inferred_style = self._infer_output_style_from_dataset(dataset, task_config)
        if inferred_style:
            # Update self.output_style_guidance for this run so dynamic prompt methods use it
            self.output_style_guidance = inferred_style
        else:
            logger.warning("Failed to infer output style, using default or user-provided guidance.")

    # Ensure self.output_style_guidance is set for the rest of the methods for this run
    # (It might have been None if user passed None and infer_output_style was False)
    if self.output_style_guidance is None:
        # Fallback if still None
        self.output_style_guidance = self.DEFAULT_OUTPUT_STYLE_GUIDANCE
    effective_output_style_guidance = self.output_style_guidance

    # The methods like get_reasoning_system_prompt_for_variation will now use the potentially updated self.output_style_guidance
    log_prefix = "DEAP MOO" if self.enable_moo else "DEAP SO"
    logger.info(f"Starting {log_prefix} Evolutionary Optimization for prompt: {task_config.instruction_prompt[:100]}...")
    logger.info(f"Population: {self.population_size}, Generations: {self.num_generations}, Mutation: {self.mutation_rate}, Crossover: {self.crossover_rate}")

    opik_dataset_obj: opik.Dataset
    if isinstance(dataset, str):
        opik_dataset_obj = self._opik_client.get_dataset(dataset)
    else:
        opik_dataset_obj = dataset

    # Opik tracking is best-effort: the optimization runs even if it cannot be registered.
    opik_optimization_run = None
    try:
        opik_optimization_run = self._opik_client.create_optimization(
            dataset_name=opik_dataset_obj.name, objective_name=metric_config.metric.name
        )
        self._current_optimization_id = opik_optimization_run.id
        logger.info(f"Created Opik Optimization run with ID: {self._current_optimization_id}")
    except Exception as e:
        logger.warning(f"Opik server error: {e}. Continuing without Opik tracking.")

    # Use of multi-objective fitness function or single-objective fitness function
    if self.enable_moo:
        def _deap_evaluate_individual_fitness(
            individual_prompt_str: str
        ) -> Tuple[float, float]:
            primary_fitness_score = self.evaluate_prompt(
                dataset=opik_dataset_obj, metric_config=metric_config, task_config=task_config,
                prompt=str(individual_prompt_str), n_samples=n_samples,
                experiment_config=(experiment_config or {}).copy(),
                optimization_id=self._current_optimization_id, verbose=0
            )
            prompt_length = float(len(str(individual_prompt_str)))
            logger.debug(f"Evaluated MOO individual '{str(individual_prompt_str)[:50]}...' -> Primary Score: {primary_fitness_score:.4f}, Length: {prompt_length}")
            return (primary_fitness_score, prompt_length)
    else:
        # Single-objective
        def _deap_evaluate_individual_fitness(
            individual_prompt_str: str
        ) -> Tuple[float,]:
            fitness_score = self.evaluate_prompt(
                dataset=opik_dataset_obj, metric_config=metric_config, task_config=task_config,
                prompt=str(individual_prompt_str), n_samples=n_samples,
                experiment_config=(experiment_config or {}).copy(),
                optimization_id=self._current_optimization_id, verbose=0
            )
            logger.debug(f"Evaluated SO individual '{str(individual_prompt_str)[:50]}...' -> Score: {fitness_score:.4f}")
            return (fitness_score,)

    def _as_individual(candidate):
        # Operators may hand back a plain string; the population must hold
        # Individuals so that each member carries a fitness attribute.
        if hasattr(candidate, "fitness"):
            return candidate
        return creator.Individual(str(candidate))

    # Register the fitness function with DEAP
    self.toolbox.register("evaluate", _deap_evaluate_individual_fitness)

    initial_prompt_strings = self._initialize_population(
        initial_prompt=task_config.instruction_prompt, task_config=task_config
    )
    deap_population = [creator.Individual(p_str) for p_str in initial_prompt_strings]
    deap_population = deap_population[:self.population_size]

    initial_eval_result = _deap_evaluate_individual_fitness(task_config.instruction_prompt)
    initial_primary_score = initial_eval_result[0]
    initial_length = initial_eval_result[1] if self.enable_moo else float(len(task_config.instruction_prompt))

    best_primary_score_overall = initial_primary_score
    best_prompt_overall = task_config.instruction_prompt
    if self.enable_moo:
        logger.info(f"Initial prompt '{task_config.instruction_prompt[:100]}...' -> Primary Score: {initial_primary_score:.4f}, Length: {initial_length}")
    else:
        logger.info(f"Initial prompt '{task_config.instruction_prompt[:100]}...' score: {initial_primary_score:.4f}")

    # Initialize the hall of fame (Pareto front for MOO) and stats for MOO or SO
    if self.enable_moo:
        hof = tools.ParetoFront()
        stats_primary = tools.Statistics(lambda ind: ind.fitness.values[0])
        stats_length = tools.Statistics(lambda ind: ind.fitness.values[1])
        stats_primary.register("avg_score", lambda x: sum(x) / len(x) if len(x) > 0 else 0)
        stats_primary.register("max_score", max)
        stats_length.register("avg_len", lambda x: sum(x) / len(x) if len(x) > 0 else 0)
        stats_length.register("min_len", min)
        mstats = tools.MultiStatistics(score=stats_primary, length=stats_length)
        logbook_header_stats = mstats.fields
        record_stats = mstats
    else:
        # Single-objective
        hof = tools.HallOfFame(self.DEFAULT_HALL_OF_FAME_SIZE)
        stats = tools.Statistics(lambda ind: ind.fitness.values[0])
        stats.register("avg", lambda x: sum(x) / len(x) if len(x) > 0 else 0)
        stats.register("std", lambda x: (sum((xi - (sum(x) / len(x) if len(x) > 0 else 0))**2 for xi in x) / len(x))**0.5 if len(x) > 1 else 0)
        stats.register("min", min)
        stats.register("max", max)
        logbook_header_stats = stats.fields
        record_stats = stats

    logbook = tools.Logbook()
    logbook.header = ["gen", "evals"] + logbook_header_stats

    # Evaluate the initial population
    fitnesses = list(map(self.toolbox.evaluate, deap_population))
    for ind, fit in zip(deap_population, fitnesses):
        ind.fitness.values = fit

    hof.update(deap_population)
    record = record_stats.compile(deap_population) if record_stats else {}
    logbook.record(gen=0, evals=len(deap_population), **record)
    if self.verbose >= 1:
        print(logbook.stream)

    if hof and len(hof) > 0:
        if self.enable_moo:
            current_best_for_primary = max(hof, key=lambda ind: ind.fitness.values[0])
        else:
            # Single-objective
            current_best_for_primary = hof[0]
        # Score and prompt are always taken from the same individual so the
        # reported best prompt matches the reported best score.
        best_primary_score_overall = current_best_for_primary.fitness.values[0]
        best_prompt_overall = str(current_best_for_primary)

        if self.enable_moo:
            logger.info(f"Gen {0}: New best primary score: {best_primary_score_overall:.4f}, Prompt: {best_prompt_overall[:100]}...")
        else:
            logger.info(f"Gen {0}: New best score: {best_primary_score_overall:.4f}")

    # Simplified history logging for this transition
    initial_round_data = OptimizationRound(
        round_number=0,
        current_prompt=best_prompt_overall,  # Representative best
        current_score=best_primary_score_overall,
        generated_prompts=[{"prompt": best_prompt_overall, "score": best_primary_score_overall, "trial_scores": [best_primary_score_overall]}],
        best_prompt=best_prompt_overall,
        best_score=best_primary_score_overall,
        improvement=0.0
    ).dict()
    self._add_to_history(initial_round_data)

    pbar_desc = f"{log_prefix} Evolutionary Optimization"
    pbar_postfix_key = "best_primary_score" if self.enable_moo else "best_score"
    pbar = tqdm(
        total=self.num_generations,
        desc=pbar_desc,
        unit="gen",
        disable=self.verbose < 1,
        postfix={pbar_postfix_key: f"{best_primary_score_overall:.4f}", "llm_calls": self.llm_call_counter}
    )

    gen = 0  # stays 0 when num_generations is 0
    for gen in range(1, self.num_generations + 1):
        self._current_generation = gen
        pbar.set_postfix({pbar_postfix_key: f"{best_primary_score_overall:.4f}", "llm_calls": self.llm_call_counter})

        # Population restart logic
        current_pop_best_primary = 0.0
        if deap_population and deap_population[0].fitness.valid:
            current_pop_best_primary = max(ind.fitness.values[0] for ind in deap_population if ind.fitness.valid)

        if self._best_fitness_history and current_pop_best_primary < self._best_fitness_history[-1] * (1 + self.DEFAULT_RESTART_THRESHOLD):
            self._generations_without_improvement += 1
        else:
            self._generations_without_improvement = 0
        self._best_fitness_history.append(current_pop_best_primary)

        if self._generations_without_improvement >= self.DEFAULT_RESTART_GENERATIONS:
            logger.info(f"Detected stagnation in primary objective at gen {gen}. Restarting population...")
            elites_for_restart = list(hof) if self.enable_moo else list(tools.selBest(deap_population, self.elitism_size))
            seed_prompt_for_restart = str(max(elites_for_restart, key=lambda ind: ind.fitness.values[0])) if elites_for_restart else best_prompt_overall

            new_population_strings = self._initialize_population(initial_prompt=seed_prompt_for_restart, task_config=task_config)
            deap_population = [creator.Individual(p_str) for p_str in new_population_strings]
            self._generations_without_improvement = 0
            fitnesses_new = list(map(self.toolbox.evaluate, deap_population))
            for ind, fit in zip(deap_population, fitnesses_new):
                ind.fitness.values = fit
            # Offspring will be selected from this new population in the next step

        # Standard DEAP evolutionary algorithm steps
        if self.enable_moo:
            # NSGA-II is used for MOO
            offspring = self.toolbox.select(deap_population, self.population_size)
        else:
            # Single-objective: Elitism + Selection
            elites = tools.selBest(deap_population, self.elitism_size)
            selected_offspring = self.toolbox.select(deap_population, len(deap_population) - self.elitism_size)
            offspring = elites + selected_offspring

        # Set up the offspring for the next generation
        offspring = list(map(self.toolbox.clone, offspring))

        # Crossover on consecutive pairs (0,1), (2,3), ... Individuals are
        # string-based and the operators return new objects, so the results
        # must be written back into the offspring list to take effect.
        for i in range(0, len(offspring) - 1, 2):
            if random.random() < self.crossover_rate:
                mated = self.toolbox.mate(offspring[i], offspring[i + 1])
                if mated is not None:
                    offspring[i] = _as_individual(mated[0])
                    offspring[i + 1] = _as_individual(mated[1])
                del offspring[i].fitness.values
                del offspring[i + 1].fitness.values

        # Mutate the offspring
        current_mutation_rate = self._get_adaptive_mutation_rate()
        for i, mutant in enumerate(offspring):
            if random.random() < current_mutation_rate:
                mutation_result = self.toolbox.mutate(mutant, task_config=task_config)
                # DEAP-style operators return a one-element tuple; a bare
                # individual or None (in-place operator) is tolerated too.
                if isinstance(mutation_result, tuple):
                    mutation_result = mutation_result[0]
                if mutation_result is not None:
                    offspring[i] = _as_individual(mutation_result)
                del offspring[i].fitness.values

        # Evaluate the offspring
        invalid_ind = [ind for ind in offspring if not ind.fitness.valid]
        fitnesses_eval = map(self.toolbox.evaluate, invalid_ind)
        for ind, fit in zip(invalid_ind, fitnesses_eval):
            ind.fitness.values = fit

        # Update the hall of fame
        hof.update(offspring)
        deap_population[:] = offspring  # Replace population

        # Update overall best score and prompt (based on primary objective for consistency)
        improved_this_generation = False
        if hof and len(hof) > 0:
            if self.enable_moo:
                current_best_on_front = max(hof, key=lambda ind: ind.fitness.values[0])
            else:
                # Single-objective
                current_best_on_front = hof[0]
            updated_best_primary_score = current_best_on_front.fitness.values[0]

            if updated_best_primary_score > best_primary_score_overall:
                best_primary_score_overall = updated_best_primary_score
                best_prompt_overall = str(current_best_on_front)
                logger.info(f"Gen {gen}: New best primary score: {best_primary_score_overall:.4f}, Prompt: {best_prompt_overall[:100]}...")
                improved_this_generation = True

        if improved_this_generation:
            self._generations_without_overall_improvement = 0
        else:
            # Score stayed equal, decreased, or the hall of fame is empty
            self._generations_without_overall_improvement += 1

        record = record_stats.compile(deap_population) if record_stats else {}
        logbook.record(gen=gen, evals=len(invalid_ind), **record)
        if self.verbose >= 1:
            print(logbook.stream)

        # Relative improvement over the initial prompt; when the initial score
        # is zero or missing, report 1.0 for any positive best score, else 0.0.
        if initial_primary_score:
            relative_improvement = (best_primary_score_overall - initial_primary_score) / abs(initial_primary_score)
        else:
            relative_improvement = 1.0 if best_primary_score_overall > 0 else 0.0

        # History logging for this transition
        # FIXME: Use model.dump() instead of dict()
        gen_round_data = OptimizationRound(
            round_number=gen,
            current_prompt=best_prompt_overall,  # Representative best
            current_score=best_primary_score_overall,
            generated_prompts=[{"prompt": str(ind), "score": ind.fitness.values[0]} for ind in deap_population if ind.fitness.valid],
            best_prompt=best_prompt_overall,
            best_score=best_primary_score_overall,
            improvement=relative_improvement
        ).dict()
        self._add_to_history(gen_round_data)
        pbar.update(1)

        if self._generations_without_overall_improvement >= self.DEFAULT_EARLY_STOPPING_GENERATIONS:
            logger.info(f"Overall best score has not improved for {self.DEFAULT_EARLY_STOPPING_GENERATIONS} generations. Stopping early at gen {gen}.")
            break

    pbar.close()
    logger.info(f"\n{log_prefix} Evolutionary Optimization finished after {gen} generations.")
    stopped_early_flag = self._generations_without_overall_improvement >= self.DEFAULT_EARLY_STOPPING_GENERATIONS
    final_details = {}
    initial_score_for_display = initial_primary_score

    if self.enable_moo:
        final_results_log = "Pareto Front Solutions:\n"
        if hof and len(hof) > 0:
            sorted_hof = sorted(hof, key=lambda ind: ind.fitness.values[0], reverse=True)
            for i, sol in enumerate(sorted_hof):
                final_results_log += f" Solution {i+1}: Primary Score={sol.fitness.values[0]:.4f}, Length={sol.fitness.values[1]:.0f}, Prompt='{str(sol)[:100]}...'\n"
            best_overall_solution = sorted_hof[0]
            final_best_prompt = str(best_overall_solution)
            final_primary_score = best_overall_solution.fitness.values[0]
            final_length = best_overall_solution.fitness.values[1]
            logger.info(final_results_log)
            logger.info(f"Representative best prompt (highest primary score from Pareto front): '{final_best_prompt}'")
            logger.info(f" Primary Score ({metric_config.metric.name}): {final_primary_score:.4f}")
            logger.info(f" Length: {final_length:.0f}")
            final_details.update({
                "initial_primary_score": initial_primary_score,
                "initial_length": initial_length,
                "final_prompt_representative": final_best_prompt,
                "final_primary_score_representative": final_primary_score,
                "final_length_representative": final_length,
                "pareto_front_solutions": [
                    {"prompt": str(ind), "score": ind.fitness.values[0], "length": ind.fitness.values[1]}
                    for ind in hof
                ] if hof else []
            })
        else:
            # MOO: ParetoFront is empty. Reporting last known best and fallback values
            logger.warning("MOO: ParetoFront is empty. Reporting last known best.")
            final_best_prompt = best_prompt_overall
            final_primary_score = best_primary_score_overall
            final_length = float(len(final_best_prompt))
            final_details.update({"initial_primary_score": initial_primary_score, "initial_length": initial_length,
                                  "final_prompt_representative": final_best_prompt, "final_primary_score_representative": final_primary_score,
                                  "final_length_representative": final_length, "pareto_front_solutions": []})
    else:
        # Single-objective
        final_best_prompt = best_prompt_overall
        final_primary_score = best_primary_score_overall
        logger.info(f"Final best prompt from Hall of Fame: '{final_best_prompt}'")
        logger.info(f"Final best score ({metric_config.metric.name}): {final_primary_score:.4f}")
        final_details.update({
            "initial_prompt": task_config.instruction_prompt,
            "initial_score": initial_primary_score,
            "initial_score_for_display": initial_primary_score,
            "final_prompt": final_best_prompt,
            "final_score": final_primary_score,
        })

    logger.info(f"Total LLM calls during optimization: {self.llm_call_counter}")
    if opik_optimization_run:
        try:
            opik_optimization_run.update(status="completed")
            logger.info(f"Opik Optimization run {self._current_optimization_id} status updated to completed.")
        except Exception as e:
            logger.warning(f"Failed to update Opik Optimization run status: {e}")

    # Add final details
    final_details.update({
        "total_generations_run": gen,
        "population_size": self.population_size,
        "mutation_probability": self.mutation_rate,
        "crossover_probability": self.crossover_rate,
        "elitism_size": self.elitism_size if not self.enable_moo else "N/A (MOO uses NSGA-II)",
        "adaptive_mutation": self.adaptive_mutation,
        # str(logbook) renders every record; logbook.stream only returns the
        # entries that were not already streamed (nothing when verbose >= 1).
        "deap_logbook": str(logbook) if logbook else "Not available",
        "task_config": task_config.dict(),
        "metric_config": metric_config.dict(),
        "model": self.model,
        "moo_enabled": self.enable_moo,
        "llm_crossover_enabled": self.enable_llm_crossover,
        "seed": self.seed,
        "prompt_type": "single_string_ga",
        "initial_score_for_display": initial_score_for_display,
        "temperature": self.model_kwargs.get("temperature"),
        "stopped_early": stopped_early_flag,
        "rounds": self.get_history(),
        "user_output_style_guidance": user_output_style_guidance,
        "infer_output_style_requested": self.infer_output_style,
        "final_effective_output_style_guidance": effective_output_style_guidance,
        "infer_output_style": self.infer_output_style,
    })

    # Return the OptimizationResult
    return OptimizationResult(
        optimizer=self.__class__.__name__,
        prompt=final_best_prompt,
        score=final_primary_score,
        metric_name=metric_config.metric.name,
        details=final_details,
        history=self.get_history(),
        llm_calls=self.llm_call_counter
    )
1049
+
1050
@_throttle.rate_limited(_rate_limiter)
def _call_model(
    self,
    prompt: str,
    system_prompt: Optional[str] = None,
    is_reasoning: bool = False,
    optimization_id: Optional[str] = None,
) -> str:
    """Send ``prompt`` to ``self.model`` through LiteLLM and return the reply text.

    Args:
        prompt: Content of the user message.
        system_prompt: Optional system message placed before the user message.
        is_reasoning: Selects the ``opik_call_type`` metadata value
            (``"reasoning"`` vs ``"evaluation_llm_task_direct"``).
        optimization_id: Added to the nested ``opik`` metadata, which only
            exists when ``self.project_name`` is set.

    Returns:
        The message content of the first completion choice.

    Raises:
        Exception: Any error raised by the LiteLLM call is logged and re-raised.
    """
    try:
        # Basic LLM parameters: instance attributes win, otherwise the defaults below.
        sampling_defaults = (
            ("temperature", 0.3),
            ("max_tokens", 1000),
            ("top_p", 1.0),
            ("frequency_penalty", 0.0),
            ("presence_penalty", 0.0),
        )
        request_params = {
            name: getattr(self, name, fallback) for name, fallback in sampling_defaults
        }

        # Prepare metadata for opik
        opik_metadata = {}
        if self.project_name:
            opik_metadata["project_name"] = self.project_name
            opik_metadata["opik"] = {"project_name": self.project_name}

        if optimization_id and "opik" in opik_metadata:
            opik_metadata["opik"]["optimization_id"] = optimization_id

        opik_metadata["optimizer_name"] = self.__class__.__name__
        if is_reasoning:
            opik_metadata["opik_call_type"] = "reasoning"
        else:
            opik_metadata["opik_call_type"] = "evaluation_llm_task_direct"

        # The metadata always holds at least the two keys set just above.
        request_params["metadata"] = opik_metadata

        chat_messages = [{"role": "user", "content": prompt}]
        if system_prompt:
            chat_messages.insert(0, {"role": "system", "content": system_prompt})

        # Pass a copy of the parameters to the Opik monitor
        monitored_params = opik_litellm_monitor.try_add_opik_monitoring_to_params(
            request_params.copy()
        )

        logger.debug(
            f"Calling model '{self.model}' with messages: {chat_messages}, "
            f"final params for litellm (from monitor): {monitored_params}"
        )

        completion = litellm.completion(
            model=self.model, messages=chat_messages, **monitored_params
        )
        return completion.choices[0].message.content
    except litellm.exceptions.RateLimitError as err:
        logger.error(f"LiteLLM Rate Limit Error: {err}")
        raise
    except litellm.exceptions.APIConnectionError as err:
        logger.error(f"LiteLLM API Connection Error: {err}")
        raise
    except litellm.exceptions.ContextWindowExceededError as err:
        logger.error(f"LiteLLM Context Window Exceeded Error: {err}")
        raise
    except Exception as err:
        logger.error(f"Error calling model '{self.model}': {type(err).__name__} - {err}")
        raise
1116
+
1117
def evaluate_prompt(
    self,
    dataset: Union[str, opik.Dataset],
    metric_config: MetricConfig,
    task_config: TaskConfig,
    prompt: str,
    n_samples: Optional[int] = None,
    dataset_item_ids: Optional[List[str]] = None,
    experiment_config: Optional[Dict] = None,
    optimization_id: Optional[str] = None,
    verbose: int = 0,
) -> float:
    """
    Evaluate a single prompt (individual) against the dataset.
    Adapted from MetaPromptOptimizer._evaluate_prompt.

    Args:
        dataset: Opik dataset, or the name of one to fetch via the Opik client.
        metric_config: Metric used to score the model outputs.
        task_config: Supplies the input/output dataset field names and the
            optional ``use_chat_prompt`` flag.
        prompt: Prompt text under evaluation.
        n_samples: Number of items to evaluate; values ``<= 0`` or larger
            than the dataset select the full dataset.
        dataset_item_ids: Explicit item ids; when non-empty they take
            precedence over ``n_samples``.
        experiment_config: Extra experiment metadata merged into the
            configuration recorded for this evaluation.
        optimization_id: Opik optimization id forwarded to the evaluator.
        verbose: Verbosity override; ``0`` means "use ``self.verbose``".

    Returns:
        The score returned by ``task_evaluator.evaluate``.
    """
    # Not consumed yet: reserved for the evaluator's pending ``verbose``
    # argument (see the FIXME at the evaluate call below).
    effective_verbose = self.verbose if verbose == 0 else verbose

    if isinstance(dataset, str):
        # This should ideally be done once in optimize_prompt if dataset is a string
        # but if called standalone, we need to handle it.
        # TODO Move to base class
        opik_eval_dataset = self._opik_client.get_dataset(dataset)
    else:
        opik_eval_dataset = dataset

    total_items = len(opik_eval_dataset.get_items())

    # Determine subset_size for this evaluation run
    # TODO Move to dataset utils
    if dataset_item_ids:
        subset_size = len(dataset_item_ids)
        logger.debug(f"Using provided {subset_size} dataset_item_ids for evaluation.")
    elif n_samples is not None:
        if n_samples > total_items:
            logger.warning(
                f"Requested n_samples ({n_samples}) for individual evaluation is larger than dataset size ({total_items}). Using full dataset."
            )
            subset_size = None
        elif n_samples <= 0:
            logger.warning(
                f"Requested n_samples ({n_samples}) is <=0. Using full dataset for this evaluation."
            )
            subset_size = None
        else:
            subset_size = n_samples
            logger.debug(f"Using specified n_samples: {subset_size} items for this evaluation run.")
    else:
        # Default behavior if no n_samples and no dataset_item_ids are given for this specific call
        # This case should be rare if n_samples is passed down from optimize_prompt
        # 20% of the dataset, clamped to [10, 20] and never more than the dataset itself.
        subset_size = min(total_items, min(20, max(10, int(total_items * 0.2))))
        logger.debug(
            f"Using automatic subset size for this evaluation: {subset_size} items (20% of {total_items} total items)"
        )

    current_experiment_config = experiment_config or {}
    current_experiment_config = {
        **current_experiment_config,
        **{
            "optimizer": self.__class__.__name__,
            "metric": metric_config.metric.name,
            "dataset": opik_eval_dataset.name,
            "configuration": {
                "prompt_evaluated": prompt,
                "n_samples_for_eval": subset_size if dataset_item_ids is None else len(dataset_item_ids),
                "total_dataset_items": total_items,
            },
        },
    }

    def llm_task(
        dataset_item: Dict[str, Any]
    ) -> Dict[str, str]:
        """Run the evaluated prompt on one dataset item and return the cleaned output."""
        if hasattr(dataset_item, "to_dict"):
            dataset_item = dataset_item.to_dict()

        for input_key in task_config.input_dataset_fields:
            if input_key not in dataset_item:
                raise ValueError(f"Input field '{input_key}' not found in dataset sample: {dataset_item}")
        if task_config.output_dataset_field not in dataset_item:
            raise ValueError(f"Output field '{task_config.output_dataset_field}' not found in dataset sample: {dataset_item}")

        # Every input field is present (validated above); render them as
        # "Field: value" lines shared by both prompting modes.
        item_specific_inputs_str = "\n".join(
            f"{field_name.capitalize()}: {dataset_item[field_name]}"
            for field_name in task_config.input_dataset_fields
        )

        if getattr(task_config, "use_chat_prompt", False):
            # Chat mode: the evaluated prompt is the system message.
            raw_model_output = self._call_model(
                prompt=item_specific_inputs_str,
                system_prompt=prompt,
                is_reasoning=False
            )
        else:
            # Plain mode: prompt and item inputs share one user message.
            raw_model_output = self._call_model(
                prompt=f"{prompt}\n\n{item_specific_inputs_str}",
                system_prompt=None,
                is_reasoning=False
            )

        # A completion without content is treated as an empty answer rather
        # than failing on None.strip().
        cleaned_model_output = (raw_model_output or "").strip()
        output_field = task_config.output_dataset_field
        prefixes_to_strip = [f"{output_field.capitalize()}:", f"{output_field}:", "Answer:"]
        for prefix in prefixes_to_strip:
            if cleaned_model_output.lower().startswith(prefix.lower()):
                cleaned_model_output = cleaned_model_output[len(prefix):].strip()
                break

        return {mappers.EVALUATED_LLM_TASK_OUTPUT: cleaned_model_output}

    logger.debug(
        f"Starting evaluation for a prompt with {subset_size if subset_size else 'all'} samples (or specific IDs) for metric: {metric_config.metric.name}"
    )

    # Evaluate the prompt
    score = task_evaluator.evaluate(
        dataset=opik_eval_dataset,
        dataset_item_ids=dataset_item_ids,
        metric_config=metric_config,
        evaluated_task=llm_task,
        num_threads=self.num_threads,
        project_name=self.project_name,
        n_samples=subset_size if dataset_item_ids is None else None,
        experiment_config=current_experiment_config,
        optimization_id=optimization_id,
        # FIXME: Hack for verbose till its merged
        #verbose=effective_verbose,
    )
    logger.debug(f"Evaluation score for prompt: {score:.4f}")
    return score
1266
+
1267
def _llm_deap_crossover(
    self,
    ind1: "creator.Individual",
    ind2: "creator.Individual"
) -> Tuple["creator.Individual", "creator.Individual"]:
    """Perform crossover by asking an LLM to blend two parent prompts.

    The model is asked for a JSON object of the form
    ``{"children_prompts": [...]}``. The first entry becomes child 1. Child 2
    is the second entry when present; otherwise it is a mutation of parent 2,
    or parent 2 itself when that mutation fails.

    Args:
        ind1: First parent individual.
        ind2: Second parent individual.

    Returns:
        Two new individuals. When the LLM call or the parsing of its reply
        fails, the result of ``self._deap_crossover(ind1, ind2)`` is returned
        instead.
    """
    parent1_str = str(ind1)
    parent2_str = str(ind2)
    current_output_style_guidance = self.output_style_guidance

    user_prompt_for_llm_crossover = f"""Parent Prompt 1:
'''{parent1_str}'''

Parent Prompt 2:
'''{parent2_str}'''

Desired output style from target LLM for children prompts: '{current_output_style_guidance}'

Please generate one or two child prompts by intelligently blending the ideas, styles, or structures from these two parents, ensuring the children aim to elicit the desired output style.
Follow the instructions provided in the system prompt regarding the JSON output format ({{"children_prompts": ["child1", ... ]}}).
"""
    try:
        logger.debug(f"Attempting LLM-driven crossover between: '{parent1_str[:50]}...' and '{parent2_str[:50]}...' aiming for style: '{current_output_style_guidance[:30]}...'")
        response_content = self._call_model(
            prompt=user_prompt_for_llm_crossover,
            system_prompt=self.get_llm_crossover_system_prompt(),
            is_reasoning=True
        )
        logger.debug(f"Raw LLM response for crossover: {response_content}")

        json_response = json.loads(response_content)
        children_strings = json_response.get("children_prompts", [])

        if not children_strings or not isinstance(children_strings, list) or not all(isinstance(cs, str) for cs in children_strings):
            logger.warning("LLM Crossover: Malformed or empty children_prompts list. Falling back.")
            raise ValueError("Malformed LLM crossover response")

        child1_str = children_strings[0]
        if len(children_strings) > 1:
            child2_str = children_strings[1]
        else:
            # Only one child came back: derive the second from parent 2.
            # A failure here (no task_config is available at this point) must
            # not throw away the valid LLM child, so it is handled locally
            # instead of reaching the outer fallback.
            try:
                child2_str = str(self._deap_mutation(creator.Individual(parent2_str), task_config=None)[0])
            except Exception as mutation_error:
                logger.warning(f"LLM Crossover: mutation for second child failed: {mutation_error}. Reusing parent 2 unchanged.")
                child2_str = parent2_str

        logger.debug(f"LLM Crossover generated child1: {child1_str[:50]}... Child2: {child2_str[:50]}...")
        return creator.Individual(child1_str), creator.Individual(str(child2_str))

    except Exception as e:
        logger.warning(f"LLM-driven crossover failed: {e}. Falling back to standard crossover.")
        return self._deap_crossover(ind1, ind2)
1313
+
1314
+ def _get_task_description_for_llm(
1315
+ self,
1316
+ task_config: TaskConfig
1317
+ ) -> str:
1318
+ """Generates a concise task description for use in LLM prompts for fresh generation or radical innovation."""
1319
+ input_fields_str = ", ".join(task_config.input_dataset_fields)
1320
+ output_field_str = task_config.output_dataset_field
1321
+ description = f"Task: Given input(s) from field(s) '{input_fields_str}', generate a response for the field '{output_field_str}'. "
1322
+ description += f"The original high-level instruction being optimized is: '{task_config.instruction_prompt}'. "
1323
+ description += "The goal is to create an effective prompt that guides a language model to perform this task well."
1324
+ return description
1325
+
1326
def get_reasoning_system_prompt_for_variation(self) -> str:
    """Return the system prompt that asks an LLM for diverse prompt variations.

    The current ``self.output_style_guidance`` is interpolated into the text
    twice, and the model is asked to reply with a JSON object holding a
    ``"prompts"`` list of ``{"prompt", "strategy"}`` entries.
    """
    return f"""You are an expert prompt engineer specializing in creating diverse and effective prompts. Given an initial prompt, your task is to generate a diverse set of alternative prompts.

For each prompt variation, consider:
1. Different levels of specificity and detail, including significantly more detailed and longer versions.
2. Various ways to structure the instruction, exploring more complex sentence structures and phrasings.
3. Alternative phrasings that maintain the core intent but vary in style and complexity.
4. Different emphasis on key components, potentially elaborating on them.
5. Various ways to express constraints or requirements.
6. Different approaches to clarity and conciseness, but also explore more verbose and explanatory styles.
7. Alternative ways to guide the model's response format.
8. Consider variations that are substantially longer and more descriptive than the original.

The generated prompts should guide a target LLM to produce outputs in the following style: '{self.output_style_guidance}'

Return a JSON array of prompts with the following structure:
{{
"prompts": [
{{
"prompt": "alternative prompt 1",
"strategy": "brief description of the variation strategy used, e.g., 'focused on eliciting specific output style'"
}},
{{
"prompt": "alternative prompt 2",
"strategy": "brief description of the variation strategy used"
}}
]
}}
Each prompt variation should aim to get the target LLM to produce answers matching the desired style: '{self.output_style_guidance}'.
"""
1356
+
1357
def get_llm_crossover_system_prompt(self) -> str:
    """Return the system prompt used for LLM-driven crossover of two parent prompts.

    The prompt asks the LLM to blend two parents into one or two children and
    to reply with a JSON object whose single key ``"children_prompts"`` maps to
    a list of strings. ``self.output_style_guidance`` is embedded (twice) as
    the target output style.
    """
    style_guidance = self.output_style_guidance
    system_prompt = f"""You are an expert prompt engineer specializing in creating novel prompts by intelligently blending existing ones.
Given two parent prompts, your task is to generate one or two new child prompts that effectively combine the strengths, styles, or core ideas of both parents.
The children should be coherent and aim to explore a potentially more effective region of the prompt design space, with a key goal of eliciting responses from the target language model in the following style: '{style_guidance}'.

Consider the following when generating children:
- Identify the key instructions, constraints, and desired output formats in each parent, paying attention to any hints about desired output style.
- Explore ways to merge these elements such that the resulting prompt strongly guides the target LLM towards the desired output style.
- You can create a child that is a direct blend, or one that takes a primary structure from one parent and incorporates specific elements from the other, always optimizing for clear instruction towards the desired output style.
- If generating two children, try to make them distinct from each other and from the parents, perhaps by emphasizing different aspects of the parental combination that could lead to the desired output style.

Return a JSON object with a single key "children_prompts", which is a list of strings. Each string is a child prompt.
Example for one child: {{"children_prompts": ["child prompt 1 designed for specified style"]}}
Example for two children: {{"children_prompts": ["child prompt 1 for target style", "child prompt 2 also for target style"]}}
Generate at least one child, and at most two. All generated prompts must aim for eliciting answers in the style: '{style_guidance}'.
"""
    return system_prompt
1373
+
1374
def get_radical_innovation_system_prompt(self) -> str:
    """Return the system prompt used to request a substantially different prompt.

    The prompt tells the LLM to rewrite an existing (possibly underperforming)
    prompt from scratch rather than make minor edits, targeting the style in
    ``self.output_style_guidance``, and to reply with the new prompt text only.
    """
    style_guidance = self.output_style_guidance
    system_prompt = f"""You are an expert prompt engineer and a creative problem solver.
Given a task description and an existing prompt for that task (which might be underperforming), your goal is to generate a new, significantly improved, and potentially very different prompt.
Do not just make minor edits. Think about alternative approaches, structures, and phrasings that could lead to better performance.
Consider clarity, specificity, constraints, and how to best guide the language model for the described task TO PRODUCE OUTPUTS IN THE FOLLOWING STYLE: '{style_guidance}'.
Return only the new prompt string, with no preamble or explanation.
"""
    return system_prompt
1381
+
1382
+ def _infer_output_style_from_dataset(
1383
+ self,
1384
+ dataset: opik.Dataset,
1385
+ task_config: TaskConfig,
1386
+ n_examples: int = 5
1387
+ ) -> Optional[str]:
1388
+ """Analyzes dataset examples to infer the desired output style."""
1389
+ logger.info(f"Attempting to infer output style from up to {n_examples} dataset examples...")
1390
+ try:
1391
+ all_items = dataset.get_items()
1392
+ except Exception as e:
1393
+ logger.error(f"Failed to get items from dataset '{dataset.name}': {e}")
1394
+ return None
1395
+
1396
+ if not all_items:
1397
+ logger.warning(f"Dataset '{dataset.name}' is empty. Cannot infer output style.")
1398
+ return None
1399
+
1400
+ # Take the first n_examples
1401
+ items_to_process = all_items[:n_examples]
1402
+
1403
+ # Need at least a couple of examples for meaningful inference
1404
+ if len(items_to_process) < min(n_examples, 2):
1405
+ logger.warning(f"Not enough dataset items (found {len(items_to_process)}) to reliably infer output style. Need at least {min(n_examples,2)}.")
1406
+ return None
1407
+
1408
+ examples_str = ""
1409
+ for i, item_obj in enumerate(items_to_process):
1410
+ item_content = item_obj.content if hasattr(item_obj, 'content') else item_obj
1411
+ if not isinstance(item_content, dict):
1412
+ logger.warning(f"Dataset item {i} does not have a .content dictionary or is not a dict itself. Skipping item: {item_obj}")
1413
+ continue
1414
+
1415
+ input_parts = []
1416
+ for field in task_config.input_dataset_fields:
1417
+ if field in item_content:
1418
+ input_parts.append(f"{field.capitalize()}: {item_content[field]}")
1419
+ input_str = "\n".join(input_parts)
1420
+ output_str = item_content.get(task_config.output_dataset_field, "[NO OUTPUT FIELD FOUND]")
1421
+ examples_str += f"Example {i+1}:\nInput(s):\n{input_str}\nOutput: {output_str}\n---\n"
1422
+
1423
+ user_prompt_for_style_inference = f"""Please analyze the following input-output examples from a dataset and provide a concise, actionable description of the REQUIRED output style for the target LLM. This description will be used to guide other LLMs in generating and refining prompts.
1424
+
1425
+ {examples_str}
1426
+
1427
+ Based on these examples, what is the desired output style description?
1428
+ Remember to focus on aspects like length, tone, structure, content details, and any recurring keywords or phrasing patterns in the outputs.
1429
+ The description should be a single string that can be directly used as an instruction for another LLM.
1430
+ Return ONLY this descriptive string.
1431
+ """
1432
+ try:
1433
+ inferred_style = self._call_model(
1434
+ prompt=user_prompt_for_style_inference,
1435
+ system_prompt=self._INFER_STYLE_SYSTEM_PROMPT,
1436
+ is_reasoning=True
1437
+ )
1438
+ inferred_style = inferred_style.strip()
1439
+ if inferred_style:
1440
+ logger.info(f"Inferred output style: '{inferred_style}'")
1441
+ return inferred_style
1442
+ else:
1443
+ logger.warning("LLM returned empty string for inferred output style.")
1444
+ return None
1445
+ except Exception as e:
1446
+ logger.error(f"Error during output style inference: {e}")
1447
+ return None