opik-optimizer 0.8.0__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. opik_optimizer/__init__.py +15 -26
  2. opik_optimizer/base_optimizer.py +28 -44
  3. opik_optimizer/data/hotpot-500.json +501 -1001
  4. opik_optimizer/datasets/__init__.py +6 -7
  5. opik_optimizer/datasets/hotpot_qa.py +2 -1
  6. opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +742 -726
  7. opik_optimizer/evolutionary_optimizer/reporting.py +246 -0
  8. opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +297 -193
  9. opik_optimizer/few_shot_bayesian_optimizer/reporting.py +119 -0
  10. opik_optimizer/meta_prompt_optimizer/__init__.py +5 -0
  11. opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +816 -0
  12. opik_optimizer/meta_prompt_optimizer/reporting.py +140 -0
  13. opik_optimizer/mipro_optimizer/__init__.py +1 -1
  14. opik_optimizer/mipro_optimizer/_mipro_optimizer_v2.py +12 -20
  15. opik_optimizer/mipro_optimizer/mipro_optimizer.py +32 -52
  16. opik_optimizer/mipro_optimizer/utils.py +1 -23
  17. opik_optimizer/optimization_config/chat_prompt.py +106 -0
  18. opik_optimizer/optimization_config/configs.py +2 -21
  19. opik_optimizer/optimization_config/mappers.py +1 -1
  20. opik_optimizer/optimization_result.py +57 -85
  21. opik_optimizer/reporting_utils.py +180 -0
  22. opik_optimizer/task_evaluator.py +41 -26
  23. opik_optimizer/utils.py +187 -3
  24. {opik_optimizer-0.8.0.dist-info → opik_optimizer-0.9.0.dist-info}/METADATA +15 -31
  25. opik_optimizer-0.9.0.dist-info/RECORD +48 -0
  26. {opik_optimizer-0.8.0.dist-info → opik_optimizer-0.9.0.dist-info}/WHEEL +1 -1
  27. opik_optimizer/few_shot_bayesian_optimizer/prompt_parameter.py +0 -91
  28. opik_optimizer/few_shot_bayesian_optimizer/prompt_templates.py +0 -80
  29. opik_optimizer/integrations/__init__.py +0 -0
  30. opik_optimizer/meta_prompt_optimizer.py +0 -1151
  31. opik_optimizer-0.8.0.dist-info/RECORD +0 -45
  32. {opik_optimizer-0.8.0.dist-info → opik_optimizer-0.9.0.dist-info}/licenses/LICENSE +0 -0
  33. {opik_optimizer-0.8.0.dist-info → opik_optimizer-0.9.0.dist-info}/top_level.txt +0 -0
@@ -1,28 +1,31 @@
1
- from typing import Optional, Union, List, Dict, Any, Tuple
2
- import opik
3
- import logging
4
- import random
5
1
  import json
6
- from string import Template
2
+ import logging
7
3
  import os
8
- import time
4
+ import random
5
+ from typing import Any, Callable, Dict, List, Literal, Optional, Set, Tuple, cast
6
+
9
7
  import Levenshtein
8
+ import litellm
10
9
  import numpy as np
10
+ import opik
11
11
 
12
- from opik_optimizer.base_optimizer import BaseOptimizer, OptimizationRound
13
- from opik_optimizer.optimization_config.configs import TaskConfig, MetricConfig
14
- from opik_optimizer.optimization_result import OptimizationResult
15
- from opik_optimizer import task_evaluator
16
- from opik_optimizer.optimization_config import mappers
17
- from opik.api_objects import opik_client
18
- from opik.environment import get_tqdm_for_current_environment
19
- from opik_optimizer import _throttle
20
- import litellm
12
+ # DEAP imports
13
+ from deap import base, tools
14
+ from deap import creator as _creator
15
+ from litellm import exceptions as litellm_exceptions
21
16
  from litellm.caching import Cache
17
+ from litellm.types.caching import LiteLLMCacheType
18
+ from opik.api_objects import opik_client, optimization
19
+ from opik.environment import get_tqdm_for_current_environment
22
20
  from opik.evaluation.models.litellm import opik_monitor as opik_litellm_monitor
23
21
 
24
- # DEAP imports
25
- from deap import base, creator, tools, algorithms
22
+ from opik_optimizer import _throttle, task_evaluator
23
+ from opik_optimizer.base_optimizer import BaseOptimizer, OptimizationRound
24
+ from opik_optimizer.optimization_config import chat_prompt, mappers
25
+ from opik_optimizer.optimization_result import OptimizationResult
26
+
27
+ from .. import utils
28
+ from . import reporting
26
29
 
27
30
  logger = logging.getLogger(__name__)
28
31
  tqdm = get_tqdm_for_current_environment()
@@ -30,13 +33,25 @@ _rate_limiter = _throttle.get_rate_limiter_for_current_opik_installation()
30
33
 
31
34
  # Using disk cache for LLM calls
32
35
  disk_cache_dir = os.path.expanduser("~/.litellm_cache")
33
- litellm.cache = Cache(type="disk", disk_cache_dir=disk_cache_dir)
36
+ litellm.cache = Cache(type=LiteLLMCacheType.DISK, disk_cache_dir=disk_cache_dir)
37
+
38
+ creator = cast(Any, _creator) # type: ignore[assignment]
34
39
 
35
40
  class EvolutionaryOptimizer(BaseOptimizer):
36
41
  """
37
- Optimizes prompts using a genetic algorithm approach.
38
- Focuses on evolving the prompt text itself.
39
- Can operate in single-objective or multi-objective mode.
42
+ The Evolutionary Optimizer can be used to optimize prompts using a 4 stage genetic algorithm
43
+ approach:
44
+
45
+ 1. Generate a set of candidate prompts based on variations of the best prompts (exploitation) as
46
+ well as completely new prompts (exploration)
47
+ 2. Evaluate the candidate prompts
48
+ 3. Select the best prompts
49
+ 4. Repeat until convergence
50
+
51
+ This algorithm is best used if you have a first draft prompt and would like to find a better
52
+ prompt.
53
+
54
+ Note: This algorithm is time consuming and can be expensive to run.
40
55
  """
41
56
 
42
57
  DEFAULT_POPULATION_SIZE = 30
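The new docstring above describes the generate → evaluate → select → repeat loop at a high level. For orientation, a minimal end-to-end call consistent with the signatures shown later in this diff might look like the sketch below; the import path is taken from the file list, while the dataset name, the `{question}` placeholder, and the `answer` field are illustrative assumptions, not part of the package.

```python
# Hedged usage sketch based only on the signatures visible in this diff; names marked
# below are assumptions, not confirmed opik-optimizer API.
import Levenshtein
import opik
from opik_optimizer.evolutionary_optimizer.evolutionary_optimizer import EvolutionaryOptimizer
from opik_optimizer.optimization_config import chat_prompt

def levenshtein_metric(dataset_item, llm_output):
    # Metric callable per the optimize_prompt docstring in this diff; the "answer"
    # key is an assumption about the dataset schema.
    return Levenshtein.ratio(dataset_item["answer"], llm_output)

prompt = chat_prompt.ChatPrompt(
    messages=[
        {"role": "system", "content": "Answer the question as concisely as possible."},
        {"role": "user", "content": "{question}"},  # placeholder name is an assumption
    ]
)

optimizer = EvolutionaryOptimizer(model="openai/gpt-4o-mini", population_size=10, num_generations=3)
dataset = opik.Opik().get_dataset("hotpot-500")  # dataset name is an assumption
result = optimizer.optimize_prompt(prompt=prompt, dataset=dataset, metric=levenshtein_metric, n_samples=50)
print(result)
```

The class is probably also re-exported from the package root, but only the module path above is visible in this diff.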
@@ -78,7 +93,7 @@ Return ONLY this descriptive string, with no preamble or extra formatting.
78
93
  def __init__(
79
94
  self,
80
95
  model: str,
81
- project_name: Optional[str] = None,
96
+ project_name: str = "Optimization",
82
97
  population_size: int = DEFAULT_POPULATION_SIZE,
83
98
  num_generations: int = DEFAULT_NUM_GENERATIONS,
84
99
  mutation_rate: float = DEFAULT_MUTATION_RATE,
@@ -95,9 +110,26 @@ Return ONLY this descriptive string, with no preamble or extra formatting.
95
110
  verbose: int = 1,
96
111
  **model_kwargs,
97
112
  ):
98
- # FIXME: Hack for verbose till its merged
99
- self.verbose = 1
100
-
113
+ """
114
+ Args:
115
+ model: The model to use for evaluation
116
+ project_name: Optional project name for tracking
117
+ population_size: Number of prompts in the population
118
+ num_generations: Number of generations to run
119
+ mutation_rate: Mutation rate for genetic operations
120
+ crossover_rate: Crossover rate for genetic operations
121
+ tournament_size: Tournament size for selection
122
+ num_threads: Number of threads for parallel evaluation
123
+ elitism_size: Number of elitism prompts
124
+ adaptive_mutation: Whether to use adaptive mutation
125
+ enable_moo: Whether to enable multi-objective optimization - when enabled, optimizes for both the supplied metric and the length of the prompt
126
+ enable_llm_crossover: Whether to enable LLM crossover
127
+ seed: Random seed for reproducibility
128
+ output_style_guidance: Output style guidance for prompts
129
+ infer_output_style: Whether to infer output style
130
+ verbose: Controls internal logging/progress bars (0=off, 1=on).
131
+ **model_kwargs: Additional model parameters
132
+ """
101
133
  # Initialize base class first
102
134
  super().__init__(model=model, project_name=project_name, **model_kwargs)
103
135
  self.population_size = population_size
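The argument list documented above maps directly onto constructor keywords. A hedged example with illustrative values (the real defaults are the DEFAULT_* constants defined in this module and are not repeated here):

```python
# Illustrative values only; see the DEFAULT_* constants in this module for the actual defaults.
from opik_optimizer.evolutionary_optimizer.evolutionary_optimizer import EvolutionaryOptimizer

optimizer = EvolutionaryOptimizer(
    model="openai/gpt-4o-mini",   # evaluation model (example value)
    project_name="Optimization",  # new default shown in this diff
    population_size=30,
    num_generations=15,
    mutation_rate=0.2,
    crossover_rate=0.8,
    enable_moo=True,              # also optimizes prompt length alongside the metric
    infer_output_style=True,
    seed=42,                      # seeds random and numpy, per the next hunk
    verbose=1,
)
```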
@@ -122,12 +154,15 @@ Return ONLY this descriptive string, with no preamble or extra formatting.
122
154
  self._llm_cache = {}
123
155
  self._current_population = []
124
156
  self._generations_without_overall_improvement = 0
157
+ self._best_primary_score_history: list[float] = []
158
+ self._gens_since_pop_improvement: int = 0
159
+ self.verbose = verbose
125
160
 
126
161
  if self.seed is not None:
127
162
  random.seed(self.seed)
128
163
  np.random.seed(self.seed)
129
164
  logger.info(f"Global random seed set to: {self.seed}")
130
- # Note: DEAP tools generally respect random.seed().
165
+ # Note: DEAP tools generally respect random.seed().
131
166
  # TODO investigate if specific DEAP components require separate seeding
132
167
 
133
168
  if self.enable_moo:
@@ -142,7 +177,7 @@ Return ONLY this descriptive string, with no preamble or extra formatting.
142
177
  if not hasattr(creator, "Individual") or getattr(creator.Individual, "fitness") != fitness_attr:
143
178
  if hasattr(creator, "Individual"):
144
179
  del creator.Individual
145
- creator.create("Individual", str, fitness=fitness_attr)
180
+ creator.create("Individual", list, fitness=fitness_attr)
146
181
 
147
182
  self.toolbox = base.Toolbox()
148
183
  self.toolbox.register("default_individual", lambda: creator.Individual("placeholder"))
@@ -206,7 +241,7 @@ Return ONLY this descriptive string, with no preamble or extra formatting.
206
241
  return 0.0
207
242
 
208
243
  # Calculate average Levenshtein distance between all pairs
209
- total_distance = 0
244
+ total_distance = 0.0
210
245
  count = 0
211
246
  for i in range(len(self._current_population)):
212
247
  for j in range(i + 1, len(self._current_population)):
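The hunk above touches `_calculate_population_diversity`, which averages the Levenshtein distance over all prompt pairs in the current population. A standalone sketch of that average (whether individual distances are length-normalized is not visible in this hunk):

```python
# Hedged sketch of the pairwise-average diversity measure referenced above.
import Levenshtein

def average_pairwise_distance(prompts: list[str]) -> float:
    total_distance, count = 0.0, 0
    for i in range(len(prompts)):
        for j in range(i + 1, len(prompts)):
            total_distance += Levenshtein.distance(prompts[i], prompts[j])
            count += 1
    return total_distance / count if count > 0 else 0.0
```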
@@ -221,18 +256,10 @@ Return ONLY this descriptive string, with no preamble or extra formatting.
221
256
 
222
257
  return total_distance / count if count > 0 else 0.0
223
258
 
224
- def _deap_crossover(
225
- self,
226
- ind1: "creator.Individual",
227
- ind2: "creator.Individual"
228
- ) -> Tuple["creator.Individual", "creator.Individual"]:
229
- """Enhanced crossover operation that preserves semantic meaning.
230
- Attempts chunk-level crossover first, then falls back to word-level.
231
- """
232
- str1_orig, str2_orig = str(ind1), str(ind2)
233
259
 
234
- chunks1 = [chunk.strip() for chunk in str1_orig.split('.') if chunk.strip()]
235
- chunks2 = [chunk.strip() for chunk in str2_orig.split('.') if chunk.strip()]
260
+ def _deap_crossover_chunking_strategy(self, messages_1_str: str, messages_2_str: str) -> Tuple[str, str]:
261
+ chunks1 = [chunk.strip() for chunk in messages_1_str.split('.') if chunk.strip()]
262
+ chunks2 = [chunk.strip() for chunk in messages_2_str.split('.') if chunk.strip()]
236
263
 
237
264
  # Try chunk-level crossover if both parents have at least 2 chunks
238
265
  if len(chunks1) >= 2 and len(chunks2) >= 2:
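`_deap_crossover_chunking_strategy`, extracted in the hunk above, recombines two prompt strings at the sentence ("chunk") level and signals failure with a ValueError so the caller can fall back to word-level crossover. A hedged sketch of the idea; the exact crossover-point selection is truncated in this hunk and assumed here to mirror the word-level scheme:

```python
# Hedged sketch of sentence-chunk crossover; the crossover-point scheme is an assumption.
import random

def chunk_crossover(a: str, b: str) -> tuple[str, str]:
    chunks_a = [c.strip() for c in a.split('.') if c.strip()]
    chunks_b = [c.strip() for c in b.split('.') if c.strip()]
    if len(chunks_a) < 2 or len(chunks_b) < 2:
        raise ValueError("Not enough chunks for chunk-level crossover")
    point = random.randint(1, min(len(chunks_a), len(chunks_b)) - 1)
    child_a = '. '.join(chunks_a[:point] + chunks_b[point:]) + '.'
    child_b = '. '.join(chunks_b[:point] + chunks_a[point:]) + '.'
    return child_a, child_b
```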
@@ -247,35 +274,73 @@ Return ONLY this descriptive string, with no preamble or extra formatting.
247
274
  child1_str = '. '.join(child1_chunks) + ('.' if child1_chunks else '')
248
275
  child2_str = '. '.join(child2_chunks) + ('.' if child2_chunks else '')
249
276
 
250
- return creator.Individual(child1_str), creator.Individual(child2_str)
251
-
252
- # Fallback to word-level crossover if chunk-level is not suitable
253
- words1 = str1_orig.split()
254
- words2 = str2_orig.split()
277
+ return child1_str, child2_str
278
+ else:
279
+ raise ValueError("Not enough chunks in either prompt for chunk-level crossover")
280
+
281
+ def _deap_crossover_word_level(self, messages_1_str: str, messages_2_str: str) -> Tuple[str, str]:
282
+ words1 = messages_1_str.split()
283
+ words2 = messages_2_str.split()
255
284
 
256
285
  # If either prompt is empty (no words), return parents
257
286
  if not words1 or not words2:
258
- return ind1, ind2
287
+ return messages_1_str, messages_2_str
259
288
 
260
289
  min_word_len = min(len(words1), len(words2))
261
290
  # Need at least 2 words in the shorter prompt for a valid crossover point
262
291
  if min_word_len < 2:
263
- return ind1, ind2
292
+ return messages_1_str, messages_2_str
264
293
 
265
294
  # Crossover point for words: 1 to min_word_len - 1
266
295
  point = random.randint(1, min_word_len - 1)
267
296
  child1_words = words1[:point] + words2[point:]
268
297
  child2_words = words2[:point] + words1[point:]
269
298
 
270
- return creator.Individual(' '.join(child1_words)), creator.Individual(' '.join(child2_words))
299
+ return ' '.join(child1_words), ' '.join(child2_words)
300
+
301
+ def _deap_crossover(
302
+ self,
303
+ ind1: "creator.Individual",
304
+ ind2: "creator.Individual"
305
+ ) -> Tuple["creator.Individual", "creator.Individual"]:
306
+ """Enhanced crossover operation that preserves semantic meaning.
307
+ Attempts chunk-level crossover first, then falls back to word-level.
308
+ """
309
+ reporting.display_message(" Recombining prompts by mixing and matching words and sentences.", verbose=self.verbose)
310
+ messages_1_orig: List[Dict[Literal["role", "content"], str]] = ind1
311
+ messages_2_orig: List[Dict[Literal["role", "content"], str]] = ind2
312
+
313
+ for i, message_1 in enumerate(messages_1_orig):
314
+ role: str = message_1['role']
315
+ message_1_str: str = message_1['content']
316
+
317
+ # We check that the second prompt has a message at this index with a matching role
318
+ if (len(messages_2_orig) >= i + 1) and (messages_2_orig[i]['role'] == role):
319
+ message_2 = messages_2_orig[i]
320
+ message_2_str: str = message_2['content']
321
+
322
+ try:
323
+ child1_str, child2_str = self._deap_crossover_chunking_strategy(message_1_str, message_2_str)
324
+ except ValueError:
325
+ child1_str, child2_str = self._deap_crossover_word_level(message_1_str, message_2_str)
326
+
327
+ # Update the message content
328
+ messages_1_orig[i]['content'] = child1_str
329
+ messages_2_orig[i]['content'] = child2_str
330
+ else:
331
+ # We don't perform any crossover if there are not enough messages or the roles
332
+ # don't match
333
+ pass
334
+
335
+ return creator.Individual(messages_1_orig), creator.Individual(messages_2_orig)
271
336
 
272
337
  def _deap_mutation(
273
338
  self,
274
339
  individual: "creator.Individual",
275
- task_config: TaskConfig
276
- ) -> Tuple["creator.Individual",]:
277
- """Enhanced mutation operation with multiple strategies. Requires task_config for some mutations."""
278
- prompt = str(individual)
340
+ initial_prompt: chat_prompt.ChatPrompt
341
+ ) -> "creator.Individual":
342
+ """Enhanced mutation operation with multiple strategies."""
343
+ prompt = chat_prompt.ChatPrompt(messages=individual)
279
344
 
280
345
  # Choose mutation strategy based on current diversity
281
346
  diversity = self._calculate_population_diversity()
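The rewritten `_deap_crossover` above now operates on chat-message lists rather than plain strings: content is recombined only at positions where both parents have a message with the same role. A compact, self-contained sketch of that flow, using the word-level splice for brevity (the real method tries the chunk strategy first and falls back on ValueError):

```python
# Hedged sketch of role-aligned, per-message crossover as described in the hunk above.
import random

def word_crossover(a: str, b: str) -> tuple[str, str]:
    wa, wb = a.split(), b.split()
    if min(len(wa), len(wb)) < 2:
        return a, b  # not enough words for a crossover point
    p = random.randint(1, min(len(wa), len(wb)) - 1)
    return ' '.join(wa[:p] + wb[p:]), ' '.join(wb[:p] + wa[p:])

def crossover_messages(msgs_a: list[dict], msgs_b: list[dict]) -> tuple[list[dict], list[dict]]:
    for i, msg_a in enumerate(msgs_a):
        # Only cross content when the other prompt has a message at this index with the same role.
        if i < len(msgs_b) and msgs_b[i]["role"] == msg_a["role"]:
            msg_a["content"], msgs_b[i]["content"] = word_crossover(msg_a["content"], msgs_b[i]["content"])
    return msgs_a, msgs_b
```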
@@ -294,23 +359,29 @@ Return ONLY this descriptive string, with no preamble or extra formatting.
294
359
 
295
360
  if mutation_choice > structural_threshold:
296
361
  # This corresponds to the original 'else' (word_level_mutation)
297
- return self._word_level_mutation(prompt)
362
+ mutated_prompt = self._word_level_mutation_prompt(prompt)
363
+ reporting.display_success(" Mutation successful, prompt has been edited by randomizing words (word-level mutation).", verbose=self.verbose)
364
+ return creator.Individual(mutated_prompt.formatted_messages)
298
365
  elif mutation_choice > semantic_threshold:
299
366
  # This corresponds to the original 'elif' (structural_mutation)
300
- return self._structural_mutation(prompt)
367
+ mutated_prompt = self._structural_mutation(prompt)
368
+ reporting.display_success(" Mutation successful, prompt has been edited by reordering, combining, or splitting sentences (structural mutation).", verbose=self.verbose)
369
+ return creator.Individual(mutated_prompt.formatted_messages)
301
370
  else:
302
371
  # This corresponds to the original 'if' (semantic_mutation)
303
- return self._semantic_mutation(prompt, task_config)
372
+ mutated_prompt = self._semantic_mutation(prompt, initial_prompt)
373
+ reporting.display_success(" Mutation successful, prompt has been edited using an LLM (semantic mutation).", verbose=self.verbose)
374
+ return creator.Individual(mutated_prompt.formatted_messages)
304
375
 
305
376
  def _semantic_mutation(
306
377
  self,
307
- prompt: str,
308
- task_config: TaskConfig
309
- ) -> Tuple["creator.Individual",]:
378
+ prompt: chat_prompt.ChatPrompt,
379
+ initial_prompt: chat_prompt.ChatPrompt
380
+ ) -> chat_prompt.ChatPrompt:
310
381
  """Enhanced semantic mutation with multiple strategies."""
311
382
  current_output_style_guidance = self.output_style_guidance
312
383
  if random.random() < 0.1:
313
- return self._radical_innovation_mutation(prompt, task_config)
384
+ return self._radical_innovation_mutation(prompt, initial_prompt)
314
385
 
315
386
  try:
316
387
  strategy = random.choice([
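`_deap_mutation` splits a single random draw among the three operators, with thresholds adjusted by the current population diversity. Neither the threshold values nor the direction of the adjustment is visible in these hunks, so the numbers below are placeholders:

```python
# Hedged sketch; threshold values and the diversity adjustment are placeholders.
import random

def pick_mutation_strategy(diversity: float) -> str:
    # Placeholder adjustment: the optimizer derives these from its diversity measure.
    semantic_threshold = 0.3 if diversity < 0.5 else 0.5
    structural_threshold = 0.6 if diversity < 0.5 else 0.8
    choice = random.random()
    if choice > structural_threshold:
        return "word_level"   # _word_level_mutation_prompt
    elif choice > semantic_threshold:
        return "structural"   # _structural_mutation
    return "semantic"         # _semantic_mutation (LLM-based)
```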
@@ -327,58 +398,79 @@ Return ONLY this descriptive string, with no preamble or extra formatting.
327
398
  }
328
399
 
329
400
  user_prompt_for_semantic_mutation = f"""Given this prompt: '{prompt}'
330
- Task context: {self._get_task_description_for_llm(task_config)}
401
+ Task context: {self._get_task_description_for_llm(initial_prompt)}
331
402
  Desired output style from target LLM: '{current_output_style_guidance}'
332
403
  Instruction for this modification: {strategy_prompts[strategy]}.
333
- Return only the modified prompt string, nothing else.
404
+ Return only the modified prompt message list, nothing else. Make sure to return a valid JSON object.
334
405
  """
335
406
  response = self._call_model(
336
- prompt=user_prompt_for_semantic_mutation,
337
- system_prompt=f"You are a prompt engineering expert. Your goal is to modify prompts to improve their effectiveness in eliciting specific types of answers, particularly matching the style: '{current_output_style_guidance}'. Follow the specific modification instruction provided.",
407
+ messages=[
408
+ {"role": "system", "content": f"You are a prompt engineering expert. Your goal is to modify prompts to improve their effectiveness in eliciting specific types of answers, particularly matching the style: '{current_output_style_guidance}'. Follow the specific modification instruction provided."},
409
+ {"role": "user", "content": user_prompt_for_semantic_mutation}
410
+ ],
338
411
  is_reasoning=True
339
412
  )
340
- return creator.Individual(response.strip()),
413
+
414
+ return chat_prompt.ChatPrompt(messages=utils.json_to_dict(response.strip()))
341
415
  except Exception as e:
342
- logger.warning(f"Error in semantic mutation for prompt '{prompt[:50]}...': {e}")
343
- return creator.Individual(prompt),
416
+ reporting.display_error(f" Error in semantic mutation, this is usually a parsing error: {e}", verbose=self.verbose)
417
+ return prompt
344
418
 
345
419
  def _structural_mutation(
346
420
  self,
347
- prompt: str
348
- ) -> Tuple["creator.Individual",]:
421
+ prompt: chat_prompt.ChatPrompt
422
+ ) -> chat_prompt.ChatPrompt:
349
423
  """Perform structural mutation (reordering, combining, splitting)."""
350
- sentences = [s.strip() for s in prompt.split('.') if s.strip()]
351
- if len(sentences) <= 1:
352
- return self._word_level_mutation(prompt)
353
-
354
- mutation_type = random.random()
355
- if mutation_type < 0.3:
356
- # Reorder sentences
357
- random.shuffle(sentences)
358
- return creator.Individual('. '.join(sentences) + '.'),
359
- elif mutation_type < 0.6:
360
- # Combine adjacent sentences
361
- if len(sentences) >= 2:
362
- idx = random.randint(0, len(sentences) - 2)
363
- combined = sentences[idx] + ' and ' + sentences[idx + 1]
364
- sentences[idx:idx+2] = [combined]
365
- return creator.Individual('. '.join(sentences) + '.'),
366
- else:
367
- # Split a sentence
368
- idx = random.randint(0, len(sentences) - 1)
369
- words = sentences[idx].split()
370
- if len(words) > 3:
371
- split_point = random.randint(2, len(words) - 2)
372
- sentences[idx:idx+1] = [' '.join(words[:split_point]), ' '.join(words[split_point:])]
373
- return creator.Individual('. '.join(sentences) + '.'),
374
-
375
- return creator.Individual(prompt),
424
+ mutated_messages: List[Dict[Literal["role", "content"], str]] = []
376
425
 
377
- def _word_level_mutation(self, prompt: str) -> Tuple["creator.Individual",]:
426
+ for message in prompt.formatted_messages:
427
+ content = message["content"]
428
+ role = message["role"]
429
+
430
+ sentences = [s.strip() for s in content.split('.') if s.strip()]
431
+ if len(sentences) <= 1:
432
+ mutated_messages.append({"role": role, "content": self._word_level_mutation(content)})
433
+ continue
434
+
435
+ mutation_type = random.random()
436
+ if mutation_type < 0.3:
437
+ # Reorder sentences
438
+ random.shuffle(sentences)
439
+ mutated_messages.append({"role": role, "content": '. '.join(sentences) + '.'})
440
+ continue
441
+ elif mutation_type < 0.6:
442
+ # Combine adjacent sentences
443
+ if len(sentences) >= 2:
444
+ idx = random.randint(0, len(sentences) - 2)
445
+ combined = sentences[idx] + ' and ' + sentences[idx + 1]
446
+ sentences[idx:idx+2] = [combined]
447
+ mutated_messages.append({"role": role, "content": '. '.join(sentences) + '.'})
448
+ continue
449
+ else:
450
+ # Split a sentence
451
+ idx = random.randint(0, len(sentences) - 1)
452
+ words = sentences[idx].split()
453
+ if len(words) > 3:
454
+ split_point = random.randint(2, len(words) - 2)
455
+ sentences[idx:idx+1] = [' '.join(words[:split_point]), ' '.join(words[split_point:])]
456
+ mutated_messages.append({"role": role, "content": '. '.join(sentences) + '.'})
457
+ continue
458
+ else:
459
+ mutated_messages.append({"role": role, "content": content})
460
+
461
+ return chat_prompt.ChatPrompt(messages=mutated_messages)
462
+
463
+ def _word_level_mutation_prompt(self, prompt: chat_prompt.ChatPrompt) -> chat_prompt.ChatPrompt:
464
+ mutated_messages: List[Dict[Literal['role', 'content'], str]] = []
465
+ for message in prompt.formatted_messages:
466
+ mutated_messages.append({"role": message["role"], "content": self._word_level_mutation(message["content"])})
467
+ return chat_prompt.ChatPrompt(messages=mutated_messages)
468
+
469
+ def _word_level_mutation(self, msg_content: str) -> str:
378
470
  """Perform word-level mutation."""
379
- words = prompt.split()
471
+ words = msg_content.split()
380
472
  if len(words) <= 1:
381
- return creator.Individual(prompt),
473
+ return msg_content
382
474
 
383
475
  mutation_type = random.random()
384
476
  if mutation_type < 0.3:
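`_structural_mutation` above now mutates each message's content independently: it reorders, combines, or splits sentences, and falls back to word-level mutation for single-sentence content. The per-message transformation, extracted as a standalone sketch:

```python
# Sketch of the per-message structural mutation shown in the hunk above.
import random

def structural_mutation(content: str) -> str:
    sentences = [s.strip() for s in content.split('.') if s.strip()]
    if len(sentences) <= 1:
        return content  # the optimizer falls back to word-level mutation here
    r = random.random()
    if r < 0.3:
        random.shuffle(sentences)                    # reorder sentences
    elif r < 0.6 and len(sentences) >= 2:
        i = random.randint(0, len(sentences) - 2)    # combine two adjacent sentences
        sentences[i:i + 2] = [sentences[i] + ' and ' + sentences[i + 1]]
    else:
        i = random.randint(0, len(sentences) - 1)    # split one sentence in the middle
        words = sentences[i].split()
        if len(words) > 3:
            p = random.randint(2, len(words) - 2)
            sentences[i:i + 1] = [' '.join(words[:p]), ' '.join(words[p:])]
    return '. '.join(sentences) + '.'
```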
@@ -395,7 +487,7 @@ Return only the modified prompt string, nothing else.
395
487
  idx = random.randint(0, len(words) - 1)
396
488
  words[idx] = self._modify_phrase(words[idx])
397
489
 
398
- return creator.Individual(' '.join(words)),
490
+ return ' '.join(words)
399
491
 
400
492
  def _get_synonym(
401
493
  self,
@@ -404,8 +496,10 @@ Return only the modified prompt string, nothing else.
404
496
  """Get a synonym for a word using LLM."""
405
497
  try:
406
498
  response = self._call_model(
407
- prompt=f"Give me a single synonym for the word '{word}'. Return only the synonym, nothing else.",
408
- system_prompt="You are a helpful assistant that provides synonyms. Return only the synonym word, no explanation or additional text.",
499
+ messages=[
500
+ {"role": "system", "content": "You are a helpful assistant that provides synonyms. Return only the synonym word, no explanation or additional text."},
501
+ {"role": "user", "content": f"Give me a single synonym for the word '{word}'. Return only the synonym, nothing else."}
502
+ ],
409
503
  is_reasoning=True
410
504
  )
411
505
  return response.strip()
@@ -420,8 +514,10 @@ Return only the modified prompt string, nothing else.
420
514
  """Modify a phrase while preserving meaning using LLM."""
421
515
  try:
422
516
  response = self._call_model(
423
- prompt=f"Modify this phrase while keeping the same meaning: '{phrase}'. Return only the modified phrase, nothing else.",
424
- system_prompt="You are a helpful assistant that rephrases text. Return only the modified phrase, no explanation or additional text.",
517
+ messages=[
518
+ {"role": "system", "content": "You are a helpful assistant that rephrases text. Return only the modified phrase, no explanation or additional text."},
519
+ {"role": "user", "content": f"Modify this phrase while keeping the same meaning: '{phrase}'. Return only the modified phrase, nothing else."}
520
+ ],
425
521
  is_reasoning=True
426
522
  )
427
523
  return response.strip()
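The hunks above switch the helper calls from separate `prompt=` / `system_prompt=` arguments to a single OpenAI-style `messages` list. Outside the package, an equivalent messages-based call through litellm (which this module already imports) would look roughly like this; `_call_model` itself is not shown in this diff, so this is an assumption about the call shape, not its implementation:

```python
# Hedged sketch of a messages-style completion via litellm; NOT the package's _call_model.
import litellm

def call_model(messages, model="openai/gpt-4o-mini"):
    response = litellm.completion(model=model, messages=messages)
    return response.choices[0].message.content

synonym = call_model([
    {"role": "system", "content": "You are a helpful assistant that provides synonyms. "
                                  "Return only the synonym word, no explanation."},
    {"role": "user", "content": "Give me a single synonym for the word 'concise'. Return only the synonym."},
])
```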
@@ -431,12 +527,12 @@ Return only the modified prompt string, nothing else.
431
527
 
432
528
  def _radical_innovation_mutation(
433
529
  self,
434
- prompt_str: str,
435
- task_config: TaskConfig
436
- ) -> Tuple["creator.Individual",]:
530
+ prompt: chat_prompt.ChatPrompt,
531
+ initial_prompt: chat_prompt.ChatPrompt
532
+ ) -> chat_prompt.ChatPrompt:
437
533
  """Attempts to generate a significantly improved and potentially very different prompt using an LLM."""
438
- logger.debug(f"Attempting radical innovation for prompt: {prompt_str[:70]}...")
439
- task_desc_for_llm = self._get_task_description_for_llm(task_config)
534
+ logger.debug(f"Attempting radical innovation for prompt: {json.dumps(prompt.formatted_messages)[:70]}...")
535
+ task_desc_for_llm = self._get_task_description_for_llm(initial_prompt)
440
536
  current_output_style_guidance = self.output_style_guidance
441
537
 
442
538
  user_prompt_for_radical_innovation = f"""Task Context:
@@ -444,225 +540,307 @@ Return only the modified prompt string, nothing else.
444
540
  Desired output style from target LLM: '{current_output_style_guidance}'
445
541
 
446
542
  Existing Prompt (which may be underperforming):
447
- '''{prompt_str}'''
543
+ '''{prompt.formatted_messages}'''
448
544
 
449
545
  Please generate a new, significantly improved, and potentially very different prompt for this task.
450
546
  Focus on alternative approaches, better clarity, or more effective guidance for the language model, aiming for the desired output style.
451
- Return only the new prompt string.
547
+ Return only the new prompt list object.
452
548
  """
453
549
  try:
454
550
  new_prompt_str = self._call_model(
455
- prompt=user_prompt_for_radical_innovation,
456
- system_prompt=self.get_radical_innovation_system_prompt(),
551
+ messages=[
552
+ {"role": "system", "content": self._get_radical_innovation_system_prompt()},
553
+ {"role": "user", "content": user_prompt_for_radical_innovation}
554
+ ],
457
555
  is_reasoning=True
458
556
  )
459
- logger.info(f"Radical innovation generated: {new_prompt_str[:70]}... from: {prompt_str[:70]}...")
460
- return creator.Individual(new_prompt_str.strip()),
557
+ logger.info(f"Radical innovation generated: {new_prompt_str[:70]}... from: {json.dumps(prompt.formatted_messages)[:70]}...")
558
+ return chat_prompt.ChatPrompt(messages=json.loads(new_prompt_str))
461
559
  except Exception as e:
462
- logger.warning(f"Radical innovation mutation failed for prompt '{prompt_str[:50]}...': {e}. Returning original.")
463
- return creator.Individual(prompt_str),
560
+ logger.warning(f"Radical innovation mutation failed for prompt '{json.dumps(prompt.formatted_messages)[:50]}...': {e}. Returning original.")
561
+ return prompt
464
562
 
465
563
  def _initialize_population(
466
564
  self,
467
- initial_prompt: str,
468
- task_config: TaskConfig,
469
- ) -> List[str]:
565
+ prompt: chat_prompt.ChatPrompt
566
+ ) -> List[chat_prompt.ChatPrompt]:
470
567
  """Initialize the population with diverse variations of the initial prompt,
471
568
  including some 'fresh start' prompts based purely on task description.
472
569
  All generated prompts should aim to elicit answers matching self.output_style_guidance.
473
570
  """
474
- population = [initial_prompt]
475
- if self.population_size <= 1:
476
- return population
571
+ with reporting.initializing_population(verbose=self.verbose) as init_pop_report:
572
+ init_pop_report.start(self.population_size)
573
+
574
+ population = [prompt]
575
+ if self.population_size <= 1:
576
+ return population
577
+
578
+ num_to_generate_total = self.population_size - 1
579
+ num_fresh_starts = max(1, int(num_to_generate_total * 0.2))
580
+ num_variations_on_initial = num_to_generate_total - num_fresh_starts
581
+
582
+ task_desc_for_llm = self._get_task_description_for_llm(prompt)
583
+ current_output_style_guidance = self.output_style_guidance
584
+
585
+ # Generate "fresh start" prompts if the initial prompt is not performing well
586
+ # Cold start prompts are generated from the task description
587
+ if num_fresh_starts > 0:
588
+ init_pop_report.start_fresh_prompts(num_fresh_starts)
589
+ fresh_start_user_prompt = f"""Here is a description of a task:
590
+ {task_desc_for_llm}
591
+
592
+ The goal is to generate prompts that will make a target LLM produce responses in the following style: '{current_output_style_guidance}'.
593
+
594
+ Please generate {num_fresh_starts} diverse and effective prompt(s) for a language model to accomplish this task, ensuring they guide towards this specific output style.
595
+ Focus on clarity, completeness, and guiding the model effectively towards the desired style. Explore different structural approaches.
596
+
597
+ Example of valid response: [
598
+ ["role": "<role>", "content": "<Prompt targeting specified style.>"],
599
+ ["role": "<role>", "content": "<Another prompt designed for the output style.>"]
600
+ ]
477
601
 
478
- num_to_generate_total = self.population_size - 1
479
- num_fresh_starts = max(1, int(num_to_generate_total * 0.2))
480
- num_variations_on_initial = num_to_generate_total - num_fresh_starts
602
+ Your response MUST be a valid JSON list of AI messages. Do NOT include any other text, explanations, or Markdown formatting like ```json ... ``` around the list.
603
+
604
+ """
605
+ try:
606
+ response_content = self._call_model(
607
+ messages=[
608
+ {"role": "system", "content": f"You are an expert prompt engineer. Your task is to generate novel, effective prompts from scratch based on a task description, specifically aiming for prompts that elicit answers in the style: '{current_output_style_guidance}'. Output ONLY a raw JSON list of strings."},
609
+ {"role": "user", "content": fresh_start_user_prompt}
610
+ ],
611
+ is_reasoning=True
612
+ )
613
+
614
+ logger.debug(f"Raw LLM response for fresh start prompts: {response_content}")
615
+
616
+ fresh_prompts = utils.json_to_dict(response_content)
617
+ if isinstance(fresh_prompts, list):
618
+ if all(isinstance(p, dict) for p in fresh_prompts) and all(p.get("role") is not None for p in fresh_prompts):
619
+ population.append(chat_prompt.ChatPrompt(messages=fresh_prompts))
620
+ init_pop_report.success_fresh_prompts(1)
621
+ elif all(isinstance(p, list) for p in fresh_prompts):
622
+ population.extend([chat_prompt.ChatPrompt(messages=p) for p in fresh_prompts[:num_fresh_starts]])
623
+ init_pop_report.success_fresh_prompts(len(fresh_prompts[:num_fresh_starts]))
624
+ else:
625
+ init_pop_report.failed_fresh_prompts(
626
+ num_fresh_starts,
627
+ f"LLM response for fresh starts was not a valid list of strings or was empty: {response_content}. Skipping fresh start prompts."
628
+ )
629
+ except json.JSONDecodeError as e_json:
630
+ init_pop_report.failed_fresh_prompts(
631
+ num_fresh_starts,
632
+ f"JSONDecodeError generating fresh start prompts: {e_json}. LLM response: '{response_content}'. Skipping fresh start prompts."
633
+ )
634
+ except Exception as e:
635
+ init_pop_report.failed_fresh_prompts(
636
+ num_fresh_starts,
637
+ f"Error generating fresh start prompts: {e}. Skipping fresh start prompts."
638
+ )
639
+
640
+ # Generate variations on the initial prompt for the remaining slots
641
+ # TODO: Could add variations with hyper-parameters from the task config like temperature, etc.
642
+ if num_variations_on_initial > 0:
643
+ init_pop_report.start_variations(num_variations_on_initial)
644
+
645
+ # TODO: We need to split this into batches as the model will not return enough tokens
646
+ # to generate all the candidates
647
+ user_prompt_for_variation = f"""Initial prompt:
648
+ '''{prompt.formatted_messages}'''
649
+
650
+ Task context:
651
+ {task_desc_for_llm}
652
+ Desired output style from target LLM: '{current_output_style_guidance}'
653
+
654
+ Generate {num_variations_on_initial} diverse alternative prompts based on the initial prompt above, keeping the task context and desired output style in mind.
655
+ All generated prompt variations should strongly aim to elicit answers from the target LLM matching the style: '{current_output_style_guidance}'.
656
+ For each variation, consider how to best achieve this style, e.g., by adjusting specificity, structure, phrasing, constraints, or by explicitly requesting it.
657
+
658
+ Return a JSON array of prompts with the following structure:
659
+ {{
660
+ "prompts": [
661
+ {{
662
+ "prompt": [{{"role": "<role>", "content": "<content>"}}],
663
+ "strategy": "brief description of the variation strategy used, e.g., 'direct instruction for target style'"
664
+ }}
665
+ // ... more prompts if num_variations_on_initial > 1
666
+ ]
667
+ }}
668
+ Ensure a good mix of variations, all targeting the specified output style from the end LLM.
669
+
670
+ Return a valid JSON object that is correctly escaped. Return nothing else, do not include any additional text or Markdown formatting.
671
+ """
672
+ try:
673
+ response_content_variations = self._call_model(
674
+ messages=[
675
+ {"role": "system", "content": self._get_reasoning_system_prompt_for_variation()},
676
+ {"role": "user", "content": user_prompt_for_variation}
677
+ ],
678
+ is_reasoning=True
679
+ )
680
+ logger.debug(f"Raw response for population variations: {response_content_variations}")
681
+ json_response_variations = json.loads(response_content_variations)
682
+ generated_prompts_variations = [p["prompt"] for p in json_response_variations.get("prompts", []) if isinstance(p, dict) and "prompt" in p]
683
+
684
+ if generated_prompts_variations:
685
+ init_pop_report.success_variations(len(generated_prompts_variations[:num_variations_on_initial]))
686
+ population.extend([chat_prompt.ChatPrompt(messages=p) for p in generated_prompts_variations[:num_variations_on_initial]])
687
+ else:
688
+ init_pop_report.failed_variations(num_variations_on_initial, "Could not parse 'prompts' list for variations. Skipping variations.")
689
+ except Exception as e:
690
+ init_pop_report.failed_variations(num_variations_on_initial, f"Error calling LLM for initial population variations: {e}")
691
+
692
+ # Ensure population is of the required size using unique prompts
693
+ # TODO Test with levenshtein distance
694
+ final_population_set: Set[str] = set()
695
+ final_population_list: List[chat_prompt.ChatPrompt] = []
696
+ for p in population:
697
+ if json.dumps(p.formatted_messages) not in final_population_set:
698
+ final_population_set.add(json.dumps(p.formatted_messages))
699
+ final_population_list.append(p)
700
+
701
+ init_pop_report.end(final_population_list)
702
+ # Return exactly population_size prompts if possible, or fewer if generation failed badly.
703
+ return final_population_list[:self.population_size]
481
704
 
482
- task_desc_for_llm = self._get_task_description_for_llm(task_config)
483
- current_output_style_guidance = self.output_style_guidance
484
705
 
485
- # Generate "fresh start" prompts if the initial prompt is not performing well
486
- # Cold start prompts are generated from the task description
487
- if num_fresh_starts > 0:
488
- logger.info(f"Generating {num_fresh_starts} 'fresh start' prompts based on task description (aiming for style: '{current_output_style_guidance[:30]}...')...")
489
- fresh_start_user_prompt = f"""Here is a description of a task:
490
- {task_desc_for_llm}
706
+ def _should_restart_population(self, curr_best: float) -> bool:
707
+ """
708
+ Update internal counters and decide if we should trigger
709
+ a population restart based on lack of improvement.
710
+ """
711
+ if self._best_primary_score_history:
712
+ threshold = self._best_primary_score_history[-1] * (1 + self.DEFAULT_RESTART_THRESHOLD)
713
+ if curr_best < threshold:
714
+ self._gens_since_pop_improvement += 1
715
+ else:
716
+ self._gens_since_pop_improvement = 0
717
+ self._best_primary_score_history.append(curr_best)
718
+ return self._gens_since_pop_improvement >= self.DEFAULT_RESTART_GENERATIONS
491
719
 
492
- The goal is to generate prompts that will make a target LLM produce responses in the following style: '{current_output_style_guidance}'.
720
+ def _restart_population(
721
+ self,
722
+ hof: tools.HallOfFame,
723
+ population: list["creator.Individual"],
724
+ best_prompt_so_far: chat_prompt.ChatPrompt,
725
+ ) -> list["creator.Individual"]:
726
+ """Return a fresh, evaluated population seeded by elites."""
727
+ if self.enable_moo:
728
+ elites = list(hof)
729
+ else:
730
+ elites = tools.selBest(population, self.elitism_size)
493
731
 
494
- Please generate {num_fresh_starts} diverse and effective prompt(s) for a language model to accomplish this task, ensuring they guide towards this specific output style.
495
- Focus on clarity, completeness, and guiding the model effectively towards the desired style. Explore different structural approaches.
496
- Your response MUST be a valid JSON list of strings. Do NOT include any other text, explanations, or Markdown formatting like ```json ... ``` around the list.
497
- Example of valid response: ["Prompt targeting specified style.", "Another prompt designed for the output style."]
498
- """
499
- try:
500
- response_content = self._call_model(
501
- prompt=fresh_start_user_prompt,
502
- system_prompt=f"You are an expert prompt engineer. Your task is to generate novel, effective prompts from scratch based on a task description, specifically aiming for prompts that elicit answers in the style: '{current_output_style_guidance}'. Output ONLY a raw JSON list of strings.",
503
- is_reasoning=True
504
- )
505
- logger.debug(f"Raw LLM response for fresh start prompts: {response_content}")
506
-
507
- cleaned_response_content = response_content.strip()
508
- if cleaned_response_content.startswith("```json"):
509
- cleaned_response_content = cleaned_response_content[7:]
510
- if cleaned_response_content.endswith("```"):
511
- cleaned_response_content = cleaned_response_content[:-3]
512
- elif cleaned_response_content.startswith("```"):
513
- cleaned_response_content = cleaned_response_content[3:]
514
- if cleaned_response_content.endswith("```"):
515
- cleaned_response_content = cleaned_response_content[:-3]
516
- cleaned_response_content = cleaned_response_content.strip()
517
-
518
- fresh_prompts = json.loads(cleaned_response_content)
519
- if isinstance(fresh_prompts, list) and all(isinstance(p, str) for p in fresh_prompts) and fresh_prompts:
520
- population.extend(fresh_prompts[:num_fresh_starts])
521
- logger.info(f"Generated {len(fresh_prompts[:num_fresh_starts])} fresh prompts from LLM.")
522
- else:
523
- logger.warning(f"LLM response for fresh starts was not a valid list of strings or was empty: {cleaned_response_content}. Using fallbacks for fresh starts.")
524
- population.extend(self._generate_fallback_variations(f"Fresh start targeting style: {current_output_style_guidance[:20]}", num_fresh_starts))
525
- except json.JSONDecodeError as e_json:
526
- logger.warning(f"JSONDecodeError generating fresh start prompts: {e_json}. LLM response (after cleaning): '{cleaned_response_content if 'cleaned_response_content' in locals() else response_content}'. Using fallbacks for fresh starts.")
527
- population.extend(self._generate_fallback_variations(f"Fresh start targeting style: {current_output_style_guidance[:20]}", num_fresh_starts))
528
- except Exception as e:
529
- logger.warning(f"Error generating fresh start prompts: {e}. Using fallbacks for fresh starts.")
530
- population.extend(self._generate_fallback_variations(f"Fresh start targeting style: {current_output_style_guidance[:20]}", num_fresh_starts))
732
+ seed_prompt = (
733
+ chat_prompt.ChatPrompt(messages=max(elites, key=lambda x: x.fitness.values[0]))
734
+ if elites else best_prompt_so_far
735
+ )
531
736
 
532
- # Generate variations on the initial prompt for the remaining slots
533
- # TODO: Could add variations with hyper-parameters from the task config like temperature, etc.
534
- if num_variations_on_initial > 0:
535
- logger.info(f"Generating {num_variations_on_initial} variations of the initial prompt (aiming for style: '{current_output_style_guidance[:30]}...')...")
536
- user_prompt_for_variation = f"""Initial prompt:
537
- '''{initial_prompt}'''
737
+ prompt_variants = self._initialize_population(seed_prompt)
738
+ new_pop = [creator.Individual(p.formatted_messages) for p in prompt_variants]
538
739
 
539
- Task context:
540
- {task_desc_for_llm}
541
- Desired output style from target LLM: '{current_output_style_guidance}'
740
+ for ind, fit in zip(new_pop, map(self.toolbox.evaluate, new_pop)):
741
+ ind.fitness.values = fit
542
742
 
543
- Generate {num_variations_on_initial} diverse alternative prompts based on the initial prompt above, keeping the task context and desired output style in mind.
544
- All generated prompt variations should strongly aim to elicit answers from the target LLM matching the style: '{current_output_style_guidance}'.
545
- For each variation, consider how to best achieve this style, e.g., by adjusting specificity, structure, phrasing, constraints, or by explicitly requesting it.
743
+ self._gens_since_pop_improvement = 0
744
+ return new_pop
546
745
 
547
- Return a JSON array of prompts with the following structure:
548
- {{
549
- "prompts": [
550
- {{
551
- "prompt": "alternative prompt 1 designed for the specified output style",
552
- "strategy": "brief description of the variation strategy used, e.g., 'direct instruction for target style'"
553
- }}
554
- // ... more prompts if num_variations_on_initial > 1
555
- ]
556
- }}
557
- Ensure a good mix of variations, all targeting the specified output style from the end LLM.
558
- """
559
- try:
560
- response_content_variations = self._call_model(
561
- prompt=user_prompt_for_variation,
562
- system_prompt=self.get_reasoning_system_prompt_for_variation(),
563
- is_reasoning=True
564
- )
565
- logger.debug(f"Raw response for population variations: {response_content_variations}")
566
- json_response_variations = json.loads(response_content_variations)
567
- generated_prompts_variations = [p["prompt"] for p in json_response_variations.get("prompts", []) if isinstance(p, dict) and "prompt" in p]
568
- if generated_prompts_variations:
569
- population.extend(generated_prompts_variations[:num_variations_on_initial])
570
- logger.info(f"Successfully parsed {len(generated_prompts_variations[:num_variations_on_initial])} variations from LLM response.")
571
- else:
572
- logger.warning("Could not parse 'prompts' list for variations. Using fallback for remaining.")
573
- population.extend(self._generate_fallback_variations(initial_prompt, num_variations_on_initial))
574
- except Exception as e:
575
- logger.error(f"Error calling LLM for initial population variations: {e}. Using fallback for remaining.")
576
- population.extend(self._generate_fallback_variations(initial_prompt, num_variations_on_initial))
577
-
578
- # Ensure population is of the required size using unique prompts
579
- # TODO Test with levenshtein distance
580
- final_population_set = set()
581
- final_population_list = []
582
- for p in population:
583
- if p not in final_population_set:
584
- final_population_set.add(p)
585
- final_population_list.append(p)
586
-
587
- # If not enough unique prompts, fill with fallbacks (could be more sophisticated)
588
- while len(final_population_list) < self.population_size and len(final_population_list) < num_to_generate_total +1:
589
- fallback_prompt = initial_prompt + f" #fallback{len(final_population_list)}"
590
- if fallback_prompt not in final_population_set:
591
- final_population_list.append(fallback_prompt)
592
- final_population_set.add(fallback_prompt)
593
- else:
594
- # Safeguard if initial_prompt itself is causing issues with uniqueness
595
- fallback_prompt = f"Fallback prompt variation {random.randint(1000,9999)}"
596
- if fallback_prompt not in final_population_set:
597
- final_population_list.append(fallback_prompt)
598
- final_population_set.add(fallback_prompt)
599
- # Avoid infinite loop in extreme edge case
600
- else: break
601
-
602
- logger.info(f"Initialized population with {len(final_population_list)} prompts.")
603
- # Return exactly population_size prompts if possible, or fewer if generation failed badly.
604
- return final_population_list[:self.population_size]
605
-
606
- def _generate_diverse_variation(
607
- self,
608
- base_prompt: str,
609
- seen_prompts: set
610
- ) -> str:
611
- """Generate a new variation that's different from existing ones."""
612
- max_attempts = 5
613
- for _ in range(max_attempts):
614
- # Try different mutation strategies
615
- mutation_choice = random.random()
616
- if mutation_choice < 0.3:
617
- new_prompt = self._semantic_mutation(base_prompt)[0]
618
- elif mutation_choice < 0.6:
619
- new_prompt = self._structural_mutation(base_prompt)[0]
620
- else:
621
- new_prompt = self._word_level_mutation(base_prompt)[0]
622
-
623
- # Check if this variation is sufficiently different
624
- is_diverse = True
625
- for existing in seen_prompts:
626
- if Levenshtein.distance(str(new_prompt), existing) / max(len(str(new_prompt)), len(existing)) < 0.3:
627
- is_diverse = False
628
- break
629
- if is_diverse:
630
- return str(new_prompt)
746
+ def _run_generation(
747
+ self,
748
+ generation_idx: int,
749
+ population: list["creator.Individual"],
750
+ prompt: chat_prompt.ChatPrompt,
751
+ hof: tools.HallOfFame,
752
+ report: Any,
753
+ best_primary_score_overall: float,
754
+ ) -> tuple[list["creator.Individual"], int]:
755
+ """Execute mating, mutation, evaluation and HoF update."""
756
+ best_gen_score = 0.0
757
+
758
+ # --- selection -------------------------------------------------
759
+ if self.enable_moo:
760
+ offspring = self.toolbox.select(population, self.population_size)
761
+ else:
762
+ elites = tools.selBest(population, self.elitism_size)
763
+ rest = self.toolbox.select(population, len(population) - self.elitism_size)
764
+ offspring = elites + rest
765
+
766
+ # --- crossover -------------------------------------------------
767
+ report.performing_crossover()
768
+ offspring = list(map(self.toolbox.clone, offspring))
769
+ for i in range(0, len(offspring), 2):
770
+ if i+1 < len(offspring):
771
+ c1, c2 = offspring[i], offspring[i+1]
772
+ if random.random() < self.crossover_rate:
773
+ c1_new, c2_new = self.toolbox.mate(c1, c2)
774
+ offspring[i], offspring[i+1] = c1_new, c2_new
775
+ del offspring[i].fitness.values, offspring[i+1].fitness.values
776
+ reporting.display_success(" Crossover successful, prompts have been combined and edited.\n│", verbose=self.verbose)
777
+
778
+ # --- mutation --------------------------------------------------
779
+ report.performing_mutation()
780
+ mut_rate = self._get_adaptive_mutation_rate()
781
+ n_mutations = 0
782
+ for i, ind in enumerate(offspring):
783
+ if random.random() < mut_rate:
784
+ new_ind = self.toolbox.mutate(ind, initial_prompt=prompt)
785
+ offspring[i] = new_ind
786
+ del offspring[i].fitness.values
787
+ n_mutations += 1
788
+ reporting.display_success(f" Mutation successful, {n_mutations} prompts have been edited.\n│", verbose=self.verbose)
631
789
 
632
- # If we couldn't generate a diverse variation, create a simple one
633
- return base_prompt + f" #v{len(seen_prompts)}"
790
+ # --- evaluation ------------------------------------------------
791
+ invalid = [ind for ind in offspring if not ind.fitness.valid]
792
+ report.performing_evaluation(len(invalid))
793
+ for ind_idx, ind in enumerate(invalid):
794
+ fit = self.toolbox.evaluate(ind)
795
+ ind.fitness.values = fit
796
+ best_gen_score = max(best_gen_score, fit[0])
634
797
 
635
- def _generate_fallback_variations(
636
- self,
637
- initial_prompt: str,
638
- num_variations: int
639
- ) -> List[str]:
640
- """Generate fallback variations when LLM generation fails."""
641
- variations = []
642
- words = initial_prompt.split()
798
+ report.performed_evaluation(ind_idx, ind.fitness.values[0])
643
799
 
644
- for i in range(num_variations):
645
- if len(words) > 3:
646
- # Shuffle words
647
- shuffled = words.copy()
648
- random.shuffle(shuffled)
649
- variations.append(' '.join(shuffled))
650
- else:
651
- # Add simple variations
652
- variations.append(initial_prompt + f" #v{i}")
800
+ # --- update HoF & reporter ------------------------------------
801
+ hof.update(offspring)
802
+ reporting.end_gen(generation_idx, best_gen_score, best_primary_score_overall, verbose=self.verbose)
653
803
 
654
- return variations
804
+ return offspring, len(invalid)
805
+
806
+ def _population_best_score(self, population: List["creator.Individual"]) -> float:
807
+ """Return highest primary-objective score among *valid* individuals."""
808
+ valid_scores = [ind.fitness.values[0] for ind in population if ind.fitness.valid]
809
+ return max(valid_scores, default=0.0)
655
810
 
656
811
  def optimize_prompt(
657
812
  self,
658
- dataset: Union[str, opik.Dataset],
659
- metric_config: MetricConfig,
660
- task_config: TaskConfig,
813
+ prompt: chat_prompt.ChatPrompt,
814
+ dataset: opik.Dataset,
815
+ metric: Callable,
661
816
  experiment_config: Optional[Dict] = None,
662
817
  n_samples: Optional[int] = None,
663
818
  auto_continue: bool = False,
664
819
  **kwargs,
665
820
  ) -> OptimizationResult:
821
+ """
822
+ Args:
823
+ prompt: The prompt to optimize
824
+ dataset: The dataset to use for evaluation
825
+ metric: Metric function to optimize with, should have the arguments `dataset_item` and `llm_output`
826
+ experiment_config: Optional experiment configuration
827
+ n_samples: Optional number of samples to use
828
+ auto_continue: Whether to automatically continue optimization
829
+ **kwargs: Additional keyword arguments
830
+ """
831
+ reporting.display_header(self.__class__.__name__, verbose=self.verbose)
832
+ reporting.display_configuration(
833
+ prompt.formatted_messages,
834
+ {
835
+ "optimizer": f"{ 'DEAP MOO' if self.enable_moo else 'DEAP SO' } Evolutionary Optimization",
836
+ "population_size": self.population_size,
837
+ "generations": self.num_generations,
838
+ "mutation_rate": self.mutation_rate,
839
+ "crossover_rate": self.crossover_rate,
840
+ },
841
+ verbose=self.verbose
842
+ )
843
+
666
844
  self.llm_call_counter = 0
667
845
  self._history = []
668
846
  self._current_optimization_id = None
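The hunk above carries the central API change of 0.9.0: `optimize_prompt` now takes a `ChatPrompt` plus a plain metric callable (with `dataset_item` and `llm_output` arguments) in place of the 0.8.0 `MetricConfig`/`TaskConfig` objects. A hedged before/after sketch, reusing the `optimizer`, `dataset`, and `chat_prompt` names from the example near the top of this diff; the `answer` key and `{question}` placeholder are assumptions:

```python
# 0.8.0-style call (dataset could also be a dataset name string):
# result = optimizer.optimize_prompt(dataset=dataset, metric_config=metric_config, task_config=task_config)

# 0.9.0-style call shown in this diff:
def exact_match(dataset_item, llm_output):
    return float(dataset_item["answer"].strip() == llm_output.strip())  # "answer" key is an assumption

result = optimizer.optimize_prompt(
    prompt=chat_prompt.ChatPrompt(messages=[{"role": "user", "content": "{question}"}]),
    dataset=dataset,      # an opik.Dataset object; plain dataset names are no longer accepted
    metric=exact_match,
    n_samples=100,
)
```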
@@ -672,145 +850,120 @@ Ensure a good mix of variations, all targeting the specified output style from t
672
850
  self._llm_cache.clear()
673
851
  self._current_population = []
674
852
  self._generations_without_overall_improvement = 0
675
-
676
- # Determine final output_style_guidance
677
- effective_output_style_guidance = self.output_style_guidance
678
- if self.infer_output_style and \
679
- (self.output_style_guidance is None or self.output_style_guidance == self.DEFAULT_OUTPUT_STYLE_GUIDANCE):
680
- # If user wants inference AND hasn't provided a specific custom guidance
681
- inferred_style = self._infer_output_style_from_dataset(dataset, task_config)
682
- if inferred_style:
683
- effective_output_style_guidance = inferred_style
684
- # Update self.output_style_guidance for this run so dynamic prompt methods use it
685
- self.output_style_guidance = inferred_style
686
- else:
687
- logger.warning("Failed to infer output style, using default or user-provided guidance.")
688
853
 
689
- # Ensure self.output_style_guidance is set to the effective one for the rest of the methods for this run
690
- # (It might have been None if user passed None and infer_output_style was False)
691
- if self.output_style_guidance is None:
692
- # Fallback if still None
693
- self.output_style_guidance = self.DEFAULT_OUTPUT_STYLE_GUIDANCE
694
-
695
- # The methods like get_reasoning_system_prompt_for_variation will now use the potentially updated self.output_style_guidance
696
- log_prefix = "DEAP MOO" if self.enable_moo else "DEAP SO"
697
- logger.info(f"Starting {log_prefix} Evolutionary Optimization for prompt: {task_config.instruction_prompt[:100]}...")
698
- logger.info(f"Population: {self.population_size}, Generations: {self.num_generations}, Mutation: {self.mutation_rate}, Crossover: {self.crossover_rate}")
699
-
700
- opik_dataset_obj: opik.Dataset
701
- if isinstance(dataset, str):
702
- opik_dataset_obj = self._opik_client.get_dataset(dataset)
703
- else:
704
- opik_dataset_obj = dataset
705
-
706
- opik_optimization_run = None
707
- try:
708
- opik_optimization_run = self._opik_client.create_optimization(
709
- dataset_name=opik_dataset_obj.name,
710
- objective_name=metric_config.metric.name,
711
- metadata={"optimizer": self.__class__.__name__},
712
- )
713
- self._current_optimization_id = opik_optimization_run.id
714
- logger.info(f"Created Opik Optimization run with ID: {self._current_optimization_id}")
715
- except Exception as e:
716
- logger.warning(f"Opik server error: {e}. Continuing without Opik tracking.")
717
-
718
- # Use of multi-objective fitness function or single-objective fitness function
854
+ # Step 0. Define fitness function
719
855
  if self.enable_moo:
720
856
  def _deap_evaluate_individual_fitness(
721
- individual_prompt_str: str
857
+ messages: List[Dict[str, str]]
722
858
  ) -> Tuple[float, float]:
723
- primary_fitness_score = self.evaluate_prompt(
724
- dataset=opik_dataset_obj, metric_config=metric_config, task_config=task_config,
725
- prompt=str(individual_prompt_str), n_samples=n_samples,
859
+ primary_fitness_score: float = self.evaluate_prompt(
860
+ prompt=chat_prompt.ChatPrompt(messages=messages),
861
+ dataset=dataset,
862
+ metric=metric,
863
+ n_samples=n_samples,
726
864
  experiment_config=(experiment_config or {}).copy(),
727
- optimization_id=self._current_optimization_id, verbose=0
865
+ optimization_id=self._current_optimization_id,
866
+ verbose=0
728
867
  )
729
- prompt_length = float(len(str(individual_prompt_str)))
730
- logger.debug(f"Evaluated MOO individual '{str(individual_prompt_str)[:50]}...' -> Primary Score: {primary_fitness_score:.4f}, Length: {prompt_length}")
868
+ prompt_length = float(len(str(json.dumps(messages))))
731
869
  return (primary_fitness_score, prompt_length)
732
870
  else:
733
871
  # Single-objective
734
872
  def _deap_evaluate_individual_fitness(
735
- individual_prompt_str: str
873
+ messages: List[Dict[str, str]]
736
874
  ) -> Tuple[float,]:
737
- fitness_score = self.evaluate_prompt(
738
- dataset=opik_dataset_obj, metric_config=metric_config, task_config=task_config,
739
- prompt=str(individual_prompt_str), n_samples=n_samples,
875
+ fitness_score: float = self.evaluate_prompt(
876
+ prompt=chat_prompt.ChatPrompt(messages=messages),
877
+ dataset=dataset,
878
+ metric=metric,
879
+ n_samples=n_samples,
740
880
  experiment_config=(experiment_config or {}).copy(),
741
- optimization_id=self._current_optimization_id, verbose=0
881
+ optimization_id=self._current_optimization_id,
882
+ verbose=0
742
883
  )
743
- logger.debug(f"Evaluated SO individual '{str(individual_prompt_str)[:50]}...' -> Score: {fitness_score:.4f}")
744
884
  return (fitness_score,)
745
-
746
- # Register the fitness function with DEAP
747
885
  self.toolbox.register("evaluate", _deap_evaluate_individual_fitness)
748
886
 
749
- initial_prompt_strings = self._initialize_population(
750
- initial_prompt=task_config.instruction_prompt, task_config=task_config
751
- )
752
- deap_population = [creator.Individual(p_str) for p_str in initial_prompt_strings]
753
- deap_population = deap_population[:self.population_size]
887
+ # Step 1. Start Opik optimization run
888
+ opik_optimization_run: Optional[optimization.Optimization] = None
889
+ try:
890
+ opik_optimization_run: optimization.Optimization = self._opik_client.create_optimization(
891
+ dataset_name=dataset.name,
892
+ objective_name=metric.__name__,
893
+ metadata={"optimizer": self.__class__.__name__},
894
+ )
895
+ self._current_optimization_id = opik_optimization_run.id
896
+ logger.info(f"Created Opik Optimization run with ID: {self._current_optimization_id}")
897
+ except Exception as e:
898
+ logger.warning(f"Opik server error: {e}. Continuing without Opik tracking.")
754
899
 
755
- initial_eval_result = _deap_evaluate_individual_fitness(task_config.instruction_prompt)
756
- initial_primary_score = initial_eval_result[0]
757
- initial_length = initial_eval_result[1] if self.enable_moo else float(len(task_config.instruction_prompt))
900
+ # Step 2. Compute the initial performance of the prompt
901
+ with reporting.baseline_performance(verbose=self.verbose) as report_baseline_performance:
902
+ initial_eval_result: Tuple[float, float] | Tuple[float, ] = _deap_evaluate_individual_fitness(prompt.formatted_messages)
903
+ initial_primary_score: float = initial_eval_result[0]
904
+ initial_length: float = initial_eval_result[1] if self.enable_moo else float(len(json.dumps(prompt.formatted_messages)))
905
+
906
+ best_primary_score_overall: float = initial_primary_score
907
+ best_prompt_overall = prompt
908
+ report_baseline_performance.set_score(initial_primary_score)
758
909
 
759
- best_primary_score_overall = initial_primary_score
760
- best_prompt_overall = task_config.instruction_prompt
761
- if self.enable_moo:
762
- logger.info(f"Initial prompt '{task_config.instruction_prompt[:100]}...' -> Primary Score: {initial_primary_score:.4f}, Length: {initial_length}")
763
- else:
764
- logger.info(f"Initial prompt '{task_config.instruction_prompt[:100]}...' score: {initial_primary_score:.4f}")
910
+ # Step 3. Define the output style guide
911
+ effective_output_style_guidance = self.output_style_guidance
912
+ if self.infer_output_style and \
913
+ (self.output_style_guidance is None or self.output_style_guidance == self.DEFAULT_OUTPUT_STYLE_GUIDANCE):
914
+ # If user wants inference AND hasn't provided a specific custom guidance
915
+ inferred_style = self._infer_output_style_from_dataset(dataset, prompt)
916
+ if inferred_style:
917
+ effective_output_style_guidance = inferred_style
918
+ # Update self.output_style_guidance for this run so dynamic prompt methods use it
919
+ self.output_style_guidance = inferred_style
920
+ else:
921
+ logger.warning("Failed to infer output style, using default or user-provided guidance.")
765
922
 
766
- # Initialize the hall of fame (Pareto front for MOO) and stats for MOO or SO
923
+ # Ensure self.output_style_guidance is set to the effective one for the rest of the methods for this run
924
+ # (It might have been None if user passed None and infer_output_style was False)
925
+ if self.output_style_guidance is None:
926
+ # Fallback if still None
927
+ self.output_style_guidance = self.DEFAULT_OUTPUT_STYLE_GUIDANCE
928
+
929
+ # Step 4. Initialize population
930
+ initial_prompts: List[chat_prompt.ChatPrompt] = self._initialize_population(
931
+ prompt=prompt
932
+ )
933
+
934
+ deap_population = [creator.Individual(p.formatted_messages) for p in initial_prompts]
935
+ deap_population = deap_population[:self.population_size]
936
+
937
+ # Step 5. Initialize the hall of fame (Pareto front for MOO) and stats for MOO or SO
767
938
  if self.enable_moo:
768
939
  hof = tools.ParetoFront()
769
- stats_primary = tools.Statistics(lambda ind: ind.fitness.values[0])
770
- stats_length = tools.Statistics(lambda ind: ind.fitness.values[1])
771
- stats_primary.register("avg_score", lambda x: sum(x) / len(x) if len(x) > 0 else 0)
772
- stats_primary.register("max_score", max)
773
- stats_length.register("avg_len", lambda x: sum(x) / len(x) if len(x) > 0 else 0)
774
- stats_length.register("min_len", min)
775
- mstats = tools.MultiStatistics(score=stats_primary, length=stats_length)
776
- logbook_header_stats = mstats.fields
777
940
  else:
778
941
  # Single-objective
779
942
  hof = tools.HallOfFame(self.DEFAULT_HALL_OF_FAME_SIZE)
780
- stats = tools.Statistics(lambda ind: ind.fitness.values[0])
781
- stats.register("avg", lambda x: sum(x) / len(x) if len(x) > 0 else 0)
782
- stats.register("std", lambda x: (sum((xi - (sum(x) / len(x) if len(x) > 0 else 0))**2 for xi in x) / len(x))**0.5 if len(x) > 1 else 0)
783
- stats.register("min", min)
784
- stats.register("max", max)
785
- logbook_header_stats = stats.fields
786
-
787
- logbook = tools.Logbook()
788
- logbook.header = ["gen", "evals"] + logbook_header_stats
789
943
 
790
- # Evaluate the initial population
791
- fitnesses = list(map(self.toolbox.evaluate, deap_population))
792
- for ind, fit in zip(deap_population, fitnesses):
793
- ind.fitness.values = fit
944
+ # Step 6. Evaluate the initial population
945
+ with reporting.evaluate_initial_population(verbose=self.verbose) as report_initial_population:
946
+ fitnesses: List[float] = list(map(self.toolbox.evaluate, deap_population))
947
+ _best_score = max(best_primary_score_overall, max([x[0] for x in fitnesses]))
948
+
949
+ for i, ind, fit in zip(range(len(deap_population)), deap_population, fitnesses):
950
+ ind.fitness.values = fit
951
+ report_initial_population.set_score(i, fit[0], _best_score)
794
952
 
795
953
  hof.update(deap_population)
796
- record_stats = mstats if self.enable_moo else stats
797
- record = record_stats.compile(deap_population) if record_stats else {}
798
- logbook.record(gen=0, evals=len(deap_population), **record)
799
- if self.verbose >= 1:
800
- print(logbook.stream)
801
-
954
+
802
955
  if hof and len(hof) > 0:
803
956
  if self.enable_moo:
804
- current_best_for_primary = max(hof, key=lambda ind: ind.fitness.values[0])
805
- best_primary_score_overall = current_best_for_primary.fitness.values[0]
806
- best_prompt_overall = str(current_best_for_primary)
957
+ current_best_for_primary: creator.Individual = max(hof, key=lambda ind: ind.fitness.values[0])
958
+ best_primary_score_overall: float = current_best_for_primary.fitness.values[0]
959
+ best_prompt_overall = chat_prompt.ChatPrompt(messages=current_best_for_primary)
807
960
  else:
808
961
  # Single-objective
809
962
  current_best_on_front = hof[0]
810
- best_primary_score_overall = current_best_on_front.fitness.values[0]
963
+ best_primary_score_overall: float = current_best_on_front.fitness.values[0]
811
964
 
812
965
  if self.enable_moo:
813
- logger.info(f"Gen {0}: New best primary score: {best_primary_score_overall:.4f}, Prompt: {best_prompt_overall[:100]}...")
966
+ logger.info(f"Gen {0}: New best primary score: {best_primary_score_overall:.4f}, Prompt: {json.dumps(best_prompt_overall.formatted_messages)[:100]}...")
814
967
  else:
815
968
  logger.info(f"Gen {0}: New best score: {best_primary_score_overall:.4f}")
816
969
 
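Editor's note on Step 5 above: the optimizer keeps either a Pareto front (multi-objective: maximize the metric score, minimize prompt length, matching the (score, length) fitness returned earlier) or a fixed-size hall of fame (single-objective). The following is a minimal, self-contained DEAP sketch of that setup; the FitnessMOO/MessageListIndividual names and the random fitness values are illustrative only, not part of the package.

import random
from deap import base, creator, tools

# Two objectives, mirroring the MOO fitness above: maximize score, minimize length.
creator.create("FitnessMOO", base.Fitness, weights=(1.0, -1.0))
creator.create("MessageListIndividual", list, fitness=creator.FitnessMOO)

population = [
    creator.MessageListIndividual([{"role": "user", "content": f"Prompt variant {i}"}])
    for i in range(4)
]
for individual in population:
    # Stand-in for _deap_evaluate_individual_fitness: (primary score, prompt length).
    individual.fitness.values = (random.random(), float(len(str(individual))))

pareto_front = tools.ParetoFront()      # MOO: keeps all non-dominated prompts
pareto_front.update(population)
hall_of_fame = tools.HallOfFame(10)     # SO alternative: keeps the top-N prompts
hall_of_fame.update(population)
best = max(pareto_front, key=lambda ind: ind.fitness.values[0])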
@@ -826,132 +979,65 @@ Ensure a good mix of variations, all targeting the specified output style from t
826
979
  ).dict()
827
980
  self._add_to_history(initial_round_data)
828
981
 
829
- pbar_desc = f"{log_prefix} Evolutionary Optimization"
830
- pbar_postfix_key = "best_primary_score" if self.enable_moo else "best_score"
831
- pbar = tqdm(
832
- total=self.num_generations,
833
- desc=pbar_desc,
834
- unit="gen",
835
- disable=self.verbose < 1,
836
- postfix={pbar_postfix_key: f"{best_primary_score_overall:.4f}", "llm_calls": self.llm_call_counter}
837
- )
982
+ with reporting.start_evolutionary_algo(verbose=self.verbose) as report_evolutionary_algo:
983
+ for generation_idx in range(1, self.num_generations + 1):
984
+ report_evolutionary_algo.start_gen(generation_idx, self.num_generations)
838
985
 
839
- gen = 0
840
- for gen_idx in range(1, self.num_generations + 1):
841
- gen = gen_idx
842
- self._current_generation = gen
843
- pbar.set_postfix({pbar_postfix_key: f"{best_primary_score_overall:.4f}", "llm_calls": self.llm_call_counter})
844
- previous_best_primary_score_for_gen = best_primary_score_overall
845
-
846
- # Population restart logic
847
- current_pop_best_primary = 0.0
848
- if deap_population and deap_population[0].fitness.valid:
849
- current_pop_best_primary = max(ind.fitness.values[0] for ind in deap_population if ind.fitness.valid)
850
-
851
- if self._best_fitness_history and current_pop_best_primary < self._best_fitness_history[-1] * (1 + self.DEFAULT_RESTART_THRESHOLD):
852
- self._generations_without_improvement += 1
853
- else:
854
- self._generations_without_improvement = 0
855
- self._best_fitness_history.append(current_pop_best_primary)
986
+ curr_best_score = self._population_best_score(deap_population)
856
987
 
857
- if self._generations_without_improvement >= self.DEFAULT_RESTART_GENERATIONS:
858
- logger.info(f"Detected stagnation in primary objective at gen {gen}. Restarting population...")
859
- elites_for_restart = list(hof) if self.enable_moo else list(tools.selBest(deap_population, self.elitism_size))
860
- seed_prompt_for_restart = str(max(elites_for_restart, key=lambda ind: ind.fitness.values[0])) if elites_for_restart else best_prompt_overall
861
-
862
- new_population_strings = self._initialize_population(initial_prompt=seed_prompt_for_restart, task_config=task_config)
863
- deap_population = [creator.Individual(p_str) for p_str in new_population_strings]
864
- self._generations_without_improvement = 0
865
- fitnesses_new = list(map(self.toolbox.evaluate, deap_population))
866
- for ind, fit in zip(deap_population, fitnesses_new):
867
- ind.fitness.values = fit
868
- # Offspring will be selected from this new population in the next step
869
-
870
- # Standard DEAP evolutionary algorithm steps
871
- if self.enable_moo:
872
- # NSGA-II is used for MOO
873
- offspring = self.toolbox.select(deap_population, self.population_size)
874
- else:
875
- # Single-objective: Elitism + Selection
876
- elites = tools.selBest(deap_population, self.elitism_size)
877
- selected_offspring = self.toolbox.select(deap_population, len(deap_population) - self.elitism_size)
878
- offspring = elites + selected_offspring
879
-
880
- # Set up the offspring for the next generation
881
- offspring = list(map(self.toolbox.clone, offspring))
882
- for child1, child2 in zip(offspring[::2], offspring[1::2]):
883
- if random.random() < self.crossover_rate:
884
- self.toolbox.mate(child1, child2)
885
- del child1.fitness.values
886
- del child2.fitness.values
887
-
888
- # Mutate the offspring
889
- current_mutation_rate = self._get_adaptive_mutation_rate()
890
- for mutant in offspring:
891
- if random.random() < current_mutation_rate:
892
- self.toolbox.mutate(mutant, task_config=task_config)
893
- del mutant.fitness.values
894
-
895
- # Evaluate the offspring
896
- invalid_ind = [ind for ind in offspring if not ind.fitness.valid]
897
- fitnesses_eval = map(self.toolbox.evaluate, invalid_ind)
898
- for ind, fit in zip(invalid_ind, fitnesses_eval):
899
- ind.fitness.values = fit
900
-
901
- # Update the hall of fame
902
- hof.update(offspring)
903
- deap_population[:] = offspring # Replace population
988
+ # ---------- restart logic -------------------------------------
989
+ if self._should_restart_population(curr_best_score):
990
+ report_evolutionary_algo.restart_population(self.DEFAULT_RESTART_GENERATIONS)
991
+ deap_population = self._restart_population(
992
+ hof, deap_population, best_prompt_overall
993
+ )
904
994
 
905
- # Update overall best score and prompt (based on primary objective for consistency)
906
- if hof and len(hof) > 0:
907
- if self.enable_moo:
908
- current_best_on_front = max(hof, key=lambda ind: ind.fitness.values[0])
909
- updated_best_primary_score = current_best_on_front.fitness.values[0]
910
- else:
911
- # Single-objective
912
- current_best_on_front = hof[0]
913
- updated_best_primary_score = current_best_on_front.fitness.values[0]
914
-
915
- if updated_best_primary_score > best_primary_score_overall:
916
- best_primary_score_overall = updated_best_primary_score
917
- best_prompt_overall = str(current_best_on_front)
918
- logger.info(f"Gen {gen}: New best primary score: {best_primary_score_overall:.4f}, Prompt: {best_prompt_overall[:100]}...")
919
- self._generations_without_overall_improvement = 0
920
- elif updated_best_primary_score == previous_best_primary_score_for_gen:
921
- # Check against score at start of this gen's logic
922
- self._generations_without_overall_improvement += 1
995
+ # ---------- run one generation --------------------------------
996
+ deap_population, invalid_count = self._run_generation(
997
+ generation_idx, deap_population, prompt, hof, report_evolutionary_algo, best_primary_score_overall
998
+ )
999
+
1000
+ # -------- update best-prompt bookkeeping -------------------------
1001
+ previous_best_primary_score_for_gen = best_primary_score_overall
1002
+ if hof:
1003
+ if self.enable_moo:
1004
+ current_best_ind = max(hof, key=lambda ind: ind.fitness.values[0])
1005
+ else:
1006
+ current_best_ind = hof[0]
1007
+
1008
+ updated_best_primary_score = current_best_ind.fitness.values[0]
1009
+ if updated_best_primary_score > best_primary_score_overall:
1010
+ best_primary_score_overall = updated_best_primary_score
1011
+ self._generations_without_overall_improvement = 0
1012
+ elif updated_best_primary_score == previous_best_primary_score_for_gen:
1013
+ self._generations_without_overall_improvement += 1
1014
+ else:
1015
+ self._generations_without_overall_improvement += 1
923
1016
  else:
924
- # Score might have decreased or HOF is empty (less likely for SO HOF with size > 0)
925
1017
  self._generations_without_overall_improvement += 1
926
- else:
927
- # Score might have decreased or HOF is empty (less likely for SO HOF with size > 0)
928
- self._generations_without_overall_improvement += 1
929
-
930
- record = record_stats.compile(deap_population) if record_stats else {}
931
- logbook.record(gen=gen, evals=len(invalid_ind), **record)
932
- if self.verbose >= 1:
933
- print(logbook.stream)
934
-
935
- # History logging for this transition
936
- # FIXME: Use model.dump() instead of dict()
937
- gen_round_data = OptimizationRound(
938
- round_number=gen,
939
- current_prompt=best_prompt_overall, # Representative best
940
- current_score=best_primary_score_overall,
941
- generated_prompts=[{"prompt": str(ind), "score": ind.fitness.values[0]} for ind in deap_population if ind.fitness.valid],
942
- best_prompt=best_prompt_overall,
943
- best_score=best_primary_score_overall,
944
- improvement=(best_primary_score_overall - initial_primary_score) / abs(initial_primary_score) if initial_primary_score and initial_primary_score != 0 else (1.0 if best_primary_score_overall > 0 else 0.0)
945
- ).dict()
946
- self._add_to_history(gen_round_data)
947
- pbar.update(1)
948
1018
 
949
- if self._generations_without_overall_improvement >= self.DEFAULT_EARLY_STOPPING_GENERATIONS:
950
- logger.info(f"Overall best score has not improved for {self.DEFAULT_EARLY_STOPPING_GENERATIONS} generations. Stopping early at gen {gen}.")
951
- break
1019
+ # ---------- early-stopping check ------------------------------
1020
+ if self._generations_without_overall_improvement >= self.DEFAULT_EARLY_STOPPING_GENERATIONS:
1021
+ logger.info(
1022
+ "No overall improvement for %d generations – early stopping at gen %d.",
1023
+ self.DEFAULT_EARLY_STOPPING_GENERATIONS,
1024
+ generation_idx,
1025
+ )
1026
+ break
1027
+
1028
+ # History logging for this transition
1029
+ # FIXME: Use model.dump() instead of dict()
1030
+ gen_round_data = OptimizationRound(
1031
+ round_number=generation_idx,
1032
+ current_prompt=best_prompt_overall, # Representative best
1033
+ current_score=best_primary_score_overall,
1034
+ generated_prompts=[{"prompt": str(ind), "score": ind.fitness.values[0]} for ind in deap_population if ind.fitness.valid],
1035
+ best_prompt=best_prompt_overall,
1036
+ best_score=best_primary_score_overall,
1037
+ improvement=(best_primary_score_overall - initial_primary_score) / abs(initial_primary_score) if initial_primary_score and initial_primary_score != 0 else (1.0 if best_primary_score_overall > 0 else 0.0)
1038
+ ).dict()
1039
+ self._add_to_history(gen_round_data)
952
1040
 
953
- pbar.close()
954
- logger.info(f"\n{log_prefix} Evolutionary Optimization finished after {gen} generations.")
955
1041
  stopped_early_flag = self._generations_without_overall_improvement >= self.DEFAULT_EARLY_STOPPING_GENERATIONS
956
1042
  final_details = {}
957
1043
  initial_score_for_display = initial_primary_score
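Editor's note on the bookkeeping in the generation loop above: each round records a relative improvement over the baseline score, and the run stops early once the best score has stopped improving for a fixed number of generations. A condensed sketch follows; the early-stopping threshold value is assumed for illustration, the real value is the class constant DEFAULT_EARLY_STOPPING_GENERATIONS.

def relative_improvement(initial_score: float, best_score: float) -> float:
    # Same guard as the `improvement` field above: avoid dividing by a zero baseline.
    if initial_score:
        return (best_score - initial_score) / abs(initial_score)
    return 1.0 if best_score > 0 else 0.0

EARLY_STOPPING_GENERATIONS = 5   # assumed value, for illustration only
best_score_overall = 0.40
generations_without_improvement = 0
for generation_idx, gen_best in enumerate([0.42, 0.45, 0.45, 0.45, 0.45, 0.45, 0.45], start=1):
    if gen_best > best_score_overall:
        best_score_overall = gen_best
        generations_without_improvement = 0
    else:
        generations_without_improvement += 1
    if generations_without_improvement >= EARLY_STOPPING_GENERATIONS:
        break   # stop the evolutionary loop early, as in the loop above

print(relative_improvement(0.40, best_score_overall))  # 0.125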
@@ -963,12 +1049,12 @@ Ensure a good mix of variations, all targeting the specified output style from t
963
1049
  for i, sol in enumerate(sorted_hof):
964
1050
  final_results_log += f" Solution {i+1}: Primary Score={sol.fitness.values[0]:.4f}, Length={sol.fitness.values[1]:.0f}, Prompt='{str(sol)[:100]}...'\n"
965
1051
  best_overall_solution = sorted_hof[0]
966
- final_best_prompt = str(best_overall_solution)
1052
+ final_best_prompt = chat_prompt.ChatPrompt(messages=best_overall_solution)
967
1053
  final_primary_score = best_overall_solution.fitness.values[0]
968
1054
  final_length = best_overall_solution.fitness.values[1]
969
1055
  logger.info(final_results_log)
970
1056
  logger.info(f"Representative best prompt (highest primary score from Pareto front): '{final_best_prompt}'")
971
- logger.info(f" Primary Score ({metric_config.metric.name}): {final_primary_score:.4f}")
1057
+ logger.info(f" Primary Score ({metric.__name__}): {final_primary_score:.4f}")
972
1058
  logger.info(f" Length: {final_length:.0f}")
973
1059
  final_details.update({
974
1060
  "initial_primary_score": initial_primary_score,
@@ -986,7 +1072,7 @@ Ensure a good mix of variations, all targeting the specified output style from t
986
1072
  logger.warning("MOO: ParetoFront is empty. Reporting last known best.")
987
1073
  final_best_prompt = best_prompt_overall
988
1074
  final_primary_score = best_primary_score_overall
989
- final_length = float(len(final_best_prompt))
1075
+ final_length = float(len(json.dumps(final_best_prompt.formatted_messages)))
990
1076
  final_details.update({"initial_primary_score": initial_primary_score, "initial_length": initial_length,
991
1077
  "final_prompt_representative": final_best_prompt, "final_primary_score_representative": final_primary_score,
992
1078
  "final_length_representative": final_length, "pareto_front_solutions": []})
@@ -995,9 +1081,9 @@ Ensure a good mix of variations, all targeting the specified output style from t
995
1081
  final_best_prompt = best_prompt_overall
996
1082
  final_primary_score = best_primary_score_overall
997
1083
  logger.info(f"Final best prompt from Hall of Fame: '{final_best_prompt}'")
998
- logger.info(f"Final best score ({metric_config.metric.name}): {final_primary_score:.4f}")
1084
+ logger.info(f"Final best score ({metric.__name__}): {final_primary_score:.4f}")
999
1085
  final_details.update({
1000
- "initial_prompt": task_config.instruction_prompt,
1086
+ "initial_prompt": prompt.formatted_messages,
1001
1087
  "initial_score": initial_primary_score,
1002
1088
  "initial_score_for_display": initial_primary_score,
1003
1089
  "final_prompt": final_best_prompt,
@@ -1014,15 +1100,13 @@ Ensure a good mix of variations, all targeting the specified output style from t
1014
1100
 
1015
1101
  # Add final details
1016
1102
  final_details.update({
1017
- "total_generations_run": gen,
1103
+ "total_generations_run": generation_idx + 1,
1018
1104
  "population_size": self.population_size,
1019
1105
  "mutation_probability": self.mutation_rate,
1020
1106
  "crossover_probability": self.crossover_rate,
1021
1107
  "elitism_size": self.elitism_size if not self.enable_moo else "N/A (MOO uses NSGA-II)",
1022
1108
  "adaptive_mutation": self.adaptive_mutation,
1023
- "deap_logbook": logbook.stream if logbook else "Not available",
1024
- "task_config": task_config.dict(),
1025
- "metric_config": metric_config.dict(),
1109
+ "metric_name": metric.__name__,
1026
1110
  "model": self.model,
1027
1111
  "moo_enabled": self.enable_moo,
1028
1112
  "llm_crossover_enabled": self.enable_llm_crossover,
@@ -1039,11 +1123,17 @@ Ensure a good mix of variations, all targeting the specified output style from t
1039
1123
  })
1040
1124
 
1041
1125
  # Return the OptimizationResult
1126
+ reporting.display_result(
1127
+ initial_score=initial_score_for_display,
1128
+ best_score=final_primary_score,
1129
+ best_prompt=final_best_prompt.formatted_messages,
1130
+ verbose=self.verbose
1131
+ )
1042
1132
  return OptimizationResult(
1043
1133
  optimizer=self.__class__.__name__,
1044
- prompt=final_best_prompt,
1134
+ prompt=final_best_prompt.formatted_messages,
1045
1135
  score=final_primary_score,
1046
- metric_name=metric_config.metric.name,
1136
+ metric_name=metric.__name__,
1047
1137
  details=final_details,
1048
1138
  history=self.get_history(),
1049
1139
  llm_calls=self.llm_call_counter
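Editor's note: the OptimizationResult assembled above now exposes the best prompt as a list of role/content message dicts rather than a single string. A small sketch of reading those fields; the summarize_result helper is hypothetical and only touches attributes populated in this method.

def summarize_result(result) -> str:
    """Hypothetical helper that reads the OptimizationResult fields populated above."""
    lines = [f"{result.metric_name}: {result.score:.4f} ({result.llm_calls} LLM calls)"]
    for message in result.prompt:                  # list of {"role", "content"} dicts
        lines.append(f"  [{message['role']}] {message['content'][:80]}")
    lines.append(f"rounds recorded: {len(result.history)}")
    return "\n".join(lines)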
@@ -1052,8 +1142,7 @@ Ensure a good mix of variations, all targeting the specified output style from t
1052
1142
  @_throttle.rate_limited(_rate_limiter)
1053
1143
  def _call_model(
1054
1144
  self,
1055
- prompt: str,
1056
- system_prompt: Optional[str] = None,
1145
+ messages: List[Dict[str, str]],
1057
1146
  is_reasoning: bool = False,
1058
1147
  optimization_id: Optional[str] = None,
1059
1148
  ) -> str:
@@ -1084,11 +1173,6 @@ Ensure a good mix of variations, all targeting the specified output style from t
1084
1173
  if metadata_for_opik:
1085
1174
  llm_config_params["metadata"] = metadata_for_opik
1086
1175
 
1087
- messages = []
1088
- if system_prompt:
1089
- messages.append({"role": "system", "content": system_prompt})
1090
- messages.append({"role": "user", "content": prompt})
1091
-
1092
1176
  # Pass llm_config_params to the Opik monitor
1093
1177
  final_call_params = opik_litellm_monitor.try_add_opik_monitoring_to_params(
1094
1178
  llm_config_params.copy()
@@ -1102,14 +1186,16 @@ Ensure a good mix of variations, all targeting the specified output style from t
1102
1186
  response = litellm.completion(
1103
1187
  model=self.model, messages=messages, **final_call_params
1104
1188
  )
1189
+
1190
+ logger.debug(f"Response: {response}")
1105
1191
  return response.choices[0].message.content
1106
- except litellm.exceptions.RateLimitError as e:
1192
+ except litellm_exceptions.RateLimitError as e:
1107
1193
  logger.error(f"LiteLLM Rate Limit Error: {e}")
1108
1194
  raise
1109
- except litellm.exceptions.APIConnectionError as e:
1195
+ except litellm_exceptions.APIConnectionError as e:
1110
1196
  logger.error(f"LiteLLM API Connection Error: {e}")
1111
1197
  raise
1112
- except litellm.exceptions.ContextWindowExceededError as e:
1198
+ except litellm_exceptions.ContextWindowExceededError as e:
1113
1199
  logger.error(f"LiteLLM Context Window Exceeded Error: {e}")
1114
1200
  raise
1115
1201
  except Exception as e:
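Editor's note: _call_model above now takes a ready-made messages list instead of separate prompt/system_prompt strings, and lets LiteLLM's typed exceptions propagate after logging them. A minimal sketch of that calling pattern under stated assumptions (the model name and prompts are placeholders, and error handling is reduced to re-raising):

import litellm
from litellm import exceptions as litellm_exceptions

def call_chat_model(model: str, system_prompt: str, user_prompt: str) -> str:
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    try:
        response = litellm.completion(model=model, messages=messages)
        return response.choices[0].message.content
    except (litellm_exceptions.RateLimitError,
            litellm_exceptions.APIConnectionError,
            litellm_exceptions.ContextWindowExceededError):
        # The optimizer logs these cases separately; here we simply propagate them.
        raise

# Example (requires provider credentials):
# call_chat_model("openai/gpt-4o-mini", "You are terse.", "Say hi.")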
@@ -1118,10 +1204,9 @@ Ensure a good mix of variations, all targeting the specified output style from t
1118
1204
 
1119
1205
  def evaluate_prompt(
1120
1206
  self,
1121
- dataset: Union[str, opik.Dataset],
1122
- metric_config: MetricConfig,
1123
- task_config: TaskConfig,
1124
- prompt: str,
1207
+ prompt: chat_prompt.ChatPrompt,
1208
+ dataset: opik.Dataset,
1209
+ metric: Callable,
1125
1210
  n_samples: Optional[int] = None,
1126
1211
  dataset_item_ids: Optional[List[str]] = None,
1127
1212
  experiment_config: Optional[Dict] = None,
@@ -1130,57 +1215,32 @@ Ensure a good mix of variations, all targeting the specified output style from t
1130
1215
  ) -> float:
1131
1216
  """
1132
1217
  Evaluate a single prompt (individual) against the dataset.
1133
- Adapted from MetaPromptOptimizer._evaluate_prompt.
1218
+
1219
+ Args:
1220
+ prompt: The prompt to evaluate
1221
+ dataset: The dataset to use for evaluation
1222
+ metric: Metric function used to score the LLM output; it should accept the arguments `dataset_item` and `llm_output`
1223
+ n_samples: Optional number of samples to use
1224
+ dataset_item_ids: Optional list of dataset item IDs to use
1225
+ experiment_config: Optional experiment configuration
1226
+ optimization_id: Optional optimization ID
1227
+ verbose: Controls internal logging/progress bars (0=off, 1=on).
1228
+
1229
+ Returns:
1230
+ float: The metric value
1134
1231
  """
1135
- effective_verbose = self.verbose if verbose == 0 else verbose
1136
-
1137
- if isinstance(dataset, str):
1138
- # This should ideally be done once in optimize_prompt if dataset is a string
1139
- # but if called standalone, we need to handle it.
1140
- # TODO Move to base class
1141
- opik_eval_dataset = self._opik_client.get_dataset(dataset)
1142
- else:
1143
- opik_eval_dataset = dataset
1144
-
1145
- total_items = len(opik_eval_dataset.get_items())
1232
+ total_items = len(dataset.get_items())
1146
1233
 
1147
- # Determine subset_size for this evaluation run
1148
- # TODO Move to dataset utils
1149
- if dataset_item_ids:
1150
- subset_size = len(dataset_item_ids)
1151
- logger.debug(f"Using provided {subset_size} dataset_item_ids for evaluation.")
1152
- elif n_samples is not None:
1153
- if n_samples > total_items:
1154
- logger.warning(
1155
- f"Requested n_samples ({n_samples}) for individual evaluation is larger than dataset size ({total_items}). Using full dataset."
1156
- )
1157
- subset_size = None
1158
- elif n_samples <= 0:
1159
- logger.warning(
1160
- f"Requested n_samples ({n_samples}) is <=0. Using full dataset for this evaluation."
1161
- )
1162
- subset_size = None
1163
- else:
1164
- subset_size = n_samples
1165
- logger.debug(f"Using specified n_samples: {subset_size} items for this evaluation run.")
1166
- else:
1167
- # Default behavior if no n_samples and no dataset_item_ids are given for this specific call
1168
- # This case should be rare if n_samples is passed down from optimize_prompt
1169
- subset_size = min(total_items, min(20, max(10, int(total_items * 0.2))))
1170
- logger.debug(
1171
- f"Using automatic subset size for this evaluation: {subset_size} items (20% of {total_items} total items)"
1172
- )
1173
-
1174
1234
  current_experiment_config = experiment_config or {}
1175
1235
  current_experiment_config = {
1176
1236
  **current_experiment_config,
1177
1237
  **{
1178
1238
  "optimizer": self.__class__.__name__,
1179
- "metric": metric_config.metric.name,
1180
- "dataset": opik_eval_dataset.name,
1239
+ "metric": metric.__name__,
1240
+ "dataset": dataset.name,
1181
1241
  "configuration": {
1182
- "prompt_evaluated": prompt,
1183
- "n_samples_for_eval": subset_size if dataset_item_ids is None else len(dataset_item_ids),
1242
+ "prompt": prompt.formatted_messages,
1243
+ "n_samples_for_eval": len(dataset_item_ids) if dataset_item_ids is not None else n_samples,
1184
1244
  "total_dataset_items": total_items,
1185
1245
  },
1186
1246
  },
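Editor's note on the new evaluate_prompt signature: the metric is now a plain callable taking dataset_item and llm_output, and the prompt is a chat_prompt.ChatPrompt whose message contents may contain str.format placeholders filled from each dataset item (see llm_task in the following hunk). A hedged usage sketch; the question/answer field names, the Levenshtein-based metric, the float return type, and the optimizer/dataset variables are assumptions for illustration only.

import Levenshtein  # already a dependency of this module
from opik_optimizer.optimization_config import chat_prompt

def answer_similarity(dataset_item: dict, llm_output: str) -> float:
    # Metric callable with the (dataset_item, llm_output) signature described above.
    return Levenshtein.ratio(str(dataset_item["answer"]), llm_output.strip())

prompt = chat_prompt.ChatPrompt(messages=[
    {"role": "system", "content": "Answer with a short factual phrase."},
    {"role": "user", "content": "Question: {question}"},   # "{question}" filled per item
])

score = optimizer.evaluate_prompt(   # `optimizer` and `dataset` assumed to exist
    prompt=prompt,
    dataset=dataset,                 # an opik.Dataset whose items carry "question"/"answer"
    metric=answer_similarity,
    n_samples=20,
)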
@@ -1189,81 +1249,35 @@ Ensure a good mix of variations, all targeting the specified output style from t
1189
1249
  def llm_task(
1190
1250
  dataset_item: Dict[str, Any]
1191
1251
  ) -> Dict[str, str]:
1192
- if hasattr(dataset_item, "to_dict"):
1193
- dataset_item = dataset_item.to_dict()
1194
-
1195
- for input_key in task_config.input_dataset_fields:
1196
- if input_key not in dataset_item:
1197
- raise ValueError(f"Input field '{input_key}' not found in dataset sample: {dataset_item}")
1198
- if task_config.output_dataset_field not in dataset_item:
1199
- raise ValueError(f"Output field '{task_config.output_dataset_field}' not found in dataset sample: {dataset_item}")
1200
-
1201
- prompt_for_llm: str
1202
- field_mapping = {
1203
- field: dataset_item[field]
1204
- for field in task_config.input_dataset_fields
1205
- if field in dataset_item
1206
- }
1207
-
1208
- if getattr(task_config, "use_chat_prompt", False):
1209
- candidate_template = Template(prompt)
1210
- user_content_parts = []
1211
- for field_name in task_config.input_dataset_fields:
1212
- if field_name in dataset_item:
1213
- user_content_parts.append(f"{field_name.capitalize()}: {dataset_item[field_name]}")
1214
- user_content = "\n".join(user_content_parts)
1215
-
1216
- raw_model_output = self._call_model(
1217
- prompt=user_content,
1218
- system_prompt=prompt,
1219
- is_reasoning=False
1220
- )
1221
-
1222
- else:
1223
- input_clauses = []
1224
- for field_name in task_config.input_dataset_fields:
1225
- if field_name in dataset_item:
1226
- input_clauses.append(
1227
- f"{field_name.capitalize()}: {dataset_item[field_name]}"
1228
- )
1229
- item_specific_inputs_str = "\n".join(input_clauses)
1230
- prompt_for_llm = f"{prompt}\n\n{item_specific_inputs_str}"
1231
-
1232
- raw_model_output = self._call_model(
1233
- prompt=prompt_for_llm,
1234
- system_prompt=None,
1235
- is_reasoning=False
1236
- )
1252
+ try:
1253
+ messages = [{
1254
+ "role": item["role"],
1255
+ "content": item["content"].format(**dataset_item)
1256
+ } for item in prompt.formatted_messages]
1257
+ except Exception as e:
1258
+ logger.warning(f"Error formatting prompt messages in llm_task (usually a template parsing error): {e}")
1259
+ return {mappers.EVALUATED_LLM_TASK_OUTPUT: ""}
1237
1260
 
1238
- cleaned_model_output = raw_model_output.strip()
1239
- output_field = task_config.output_dataset_field
1240
- prefixes_to_strip = [f"{output_field.capitalize()}:", f"{output_field}:", "Answer:"]
1241
- for prefix in prefixes_to_strip:
1242
- if cleaned_model_output.lower().startswith(prefix.lower()):
1243
- cleaned_model_output = cleaned_model_output[len(prefix):].strip()
1244
- break
1261
+ model_output = self._call_model(
1262
+ messages=messages,
1263
+ is_reasoning=False
1264
+ )
1245
1265
 
1246
- return {mappers.EVALUATED_LLM_TASK_OUTPUT: cleaned_model_output}
1247
-
1248
- logger.debug(
1249
- f"Starting evaluation for a prompt with {subset_size if subset_size else 'all'} samples (or specific IDs) for metric: {metric_config.metric.name}"
1250
- )
1266
+ return {mappers.EVALUATED_LLM_TASK_OUTPUT: model_output}
1251
1267
 
1252
1268
  # Evaluate the prompt
1253
1269
  score = task_evaluator.evaluate(
1254
- dataset=opik_eval_dataset,
1270
+ dataset=dataset,
1255
1271
  dataset_item_ids=dataset_item_ids,
1256
- metric_config=metric_config,
1272
+ metric=metric,
1257
1273
  evaluated_task=llm_task,
1258
1274
  num_threads=self.num_threads,
1259
1275
  project_name=self.project_name,
1260
- n_samples=subset_size if dataset_item_ids is None else None,
1276
+ n_samples=n_samples if dataset_item_ids is None else None,
1261
1277
  experiment_config=current_experiment_config,
1262
1278
  optimization_id=optimization_id,
1263
- # FIXME: Hack for verbose till its merged
1264
- #verbose=effective_verbose,
1279
+ verbose=verbose
1265
1280
  )
1266
- logger.debug(f"Evaluation score for prompt: {score:.4f}")
1267
1281
  return score
1268
1282
 
1269
1283
  def _llm_deap_crossover(
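Editor's note, before the crossover helper continues below: a standalone sketch of the per-item templating that llm_task above applies. Every message's content is treated as a str.format template and filled from the dataset item; a formatting failure makes the item score against an empty output. Field names are illustrative.

def render_messages(formatted_messages, dataset_item: dict):
    try:
        return [
            {"role": m["role"], "content": m["content"].format(**dataset_item)}
            for m in formatted_messages
        ]
    except (KeyError, IndexError, ValueError):
        # Mirrors the fallback above: treat the item as producing an empty output.
        return None

rendered = render_messages(
    [{"role": "user", "content": "Question: {question}"}],
    {"question": "Who wrote Hamlet?", "answer": "William Shakespeare"},
)
# rendered == [{"role": "user", "content": "Question: Who wrote Hamlet?"}]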
@@ -1272,42 +1286,48 @@ Ensure a good mix of variations, all targeting the specified output style from t
1272
1286
  ind2: "creator.Individual"
1273
1287
  ) -> Tuple["creator.Individual", "creator.Individual"]:
1274
1288
  """Perform crossover by asking an LLM to blend two parent prompts."""
1275
- parent1_str = str(ind1)
1276
- parent2_str = str(ind2)
1289
+ reporting.display_message(" Recombining prompts using an LLM.", verbose=self.verbose)
1290
+
1291
+ parent1_messages: List[Dict[Literal["role", "content"], str]] = ind1
1292
+ parent2_messages: List[Dict[Literal["role", "content"], str]] = ind2
1277
1293
  current_output_style_guidance = self.output_style_guidance
1278
1294
 
1279
1295
  user_prompt_for_llm_crossover = f"""Parent Prompt 1:
1280
- '''{parent1_str}'''
1296
+ '''{parent1_messages}'''
1281
1297
 
1282
1298
  Parent Prompt 2:
1283
- '''{parent2_str}'''
1299
+ '''{parent2_messages}'''
1284
1300
 
1285
1301
  Desired output style from target LLM for children prompts: '{current_output_style_guidance}'
1286
1302
 
1287
- Please generate one or two child prompts by intelligently blending the ideas, styles, or structures from these two parents, ensuring the children aim to elicit the desired output style.
1288
- Follow the instructions provided in the system prompt regarding the JSON output format ({{"children_prompts": ["child1", ... ]}}).
1303
+ Please generate TWO child prompts by intelligently blending the ideas, styles, or structures from these two parents, ensuring the children aim to elicit the desired output style.
1304
+ Follow the instructions provided in the system prompt regarding the JSON output format:
1305
+ [
1306
+ [{{"role": "<role>", "content": "<content>"}}, {{"role": "<role>", "content": "<content>"}}], #child_1
1307
+ [{{"role": "<role>", "content": "<content>"}}, {{"role": "<role>", "content": "<content>"}}], #child_2
1308
+ ]
1289
1309
  """
1290
1310
  try:
1291
- logger.debug(f"Attempting LLM-driven crossover between: '{parent1_str[:50]}...' and '{parent2_str[:50]}...' aiming for style: '{current_output_style_guidance[:30]}...'")
1311
+ logger.debug(f"Attempting LLM-driven crossover between: '{parent1_messages[:50]}...' and '{parent2_messages[:50]}...' aiming for style: '{current_output_style_guidance[:30]}...'")
1292
1312
  response_content = self._call_model(
1293
- prompt=user_prompt_for_llm_crossover,
1294
- system_prompt=self.get_llm_crossover_system_prompt(),
1313
+ messages=[
1314
+ {"role": "system", "content": self.get_llm_crossover_system_prompt()},
1315
+ {"role": "user", "content": user_prompt_for_llm_crossover},
1316
+ ],
1295
1317
  is_reasoning=True
1296
1318
  )
1297
1319
  logger.debug(f"Raw LLM response for crossover: {response_content}")
1298
1320
 
1299
- json_response = json.loads(response_content)
1300
- children_strings = json_response.get("children_prompts", [])
1301
-
1302
- if not children_strings or not isinstance(children_strings, list) or not all(isinstance(cs, str) for cs in children_strings):
1321
+ json_response = utils.json_to_dict(response_content)
1322
+ if not isinstance(json_response, list) or len(json_response) != 2 or not all(isinstance(cs, list) for cs in json_response):
1303
1323
  logger.warning("LLM Crossover: Malformed or empty children_prompts list. Falling back.")
1304
1324
  raise ValueError("Malformed LLM crossover response")
1305
1325
 
1306
- child1_str = children_strings[0]
1307
- child2_str = children_strings[1] if len(children_strings) > 1 else self._deap_mutation(creator.Individual(parent2_str), task_config=None)[0] # task_config might not be available or needed here for simple mutation
1326
+ child1: List[Dict[Literal["role", "content"], str]] = json_response[0]
1327
+ child2: List[Dict[Literal["role", "content"], str]] = json_response[1]
1308
1328
 
1309
- logger.debug(f"LLM Crossover generated child1: {child1_str[:50]}... Child2: {child2_str[:50]}...")
1310
- return creator.Individual(child1_str), creator.Individual(str(child2_str))
1329
+ logger.debug(f"LLM Crossover generated child1: {json.dumps(child1)[:50]}... Child2: {json.dumps(child2)[:50]}...")
1330
+ return creator.Individual(child1), creator.Individual(child2)
1311
1331
 
1312
1332
  except Exception as e:
1313
1333
  logger.warning(f"LLM-driven crossover failed: {e}. Falling back to standard crossover.")
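Editor's note on the LLM-driven crossover above: the model's reply must be a JSON list of exactly two children, each itself a list of role/content messages; anything else triggers the fallback to standard crossover. A minimal validation sketch, using json.loads in place of the package's utils.json_to_dict:

import json

def parse_crossover_children(response_content: str):
    children = json.loads(response_content)
    if (
        not isinstance(children, list)
        or len(children) != 2
        or not all(isinstance(child, list) for child in children)
    ):
        raise ValueError("Malformed LLM crossover response")
    return children[0], children[1]

child1, child2 = parse_crossover_children(
    '[[{"role": "user", "content": "variant A"}],'
    ' [{"role": "user", "content": "variant B"}]]'
)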
@@ -1315,17 +1335,15 @@ Follow the instructions provided in the system prompt regarding the JSON output
1315
1335
 
1316
1336
  def _get_task_description_for_llm(
1317
1337
  self,
1318
- task_config: TaskConfig
1338
+ prompt: chat_prompt.ChatPrompt
1319
1339
  ) -> str:
1320
1340
  """Generates a concise task description for use in LLM prompts for fresh generation or radical innovation."""
1321
- input_fields_str = ", ".join(task_config.input_dataset_fields)
1322
- output_field_str = task_config.output_dataset_field
1323
- description = f"Task: Given input(s) from field(s) '{input_fields_str}', generate a response for the field '{output_field_str}'. "
1324
- description += f"The original high-level instruction being optimized is: '{task_config.instruction_prompt}'. "
1341
+ description = "Task: Given a list of AI messages with placeholder values, generate an effective prompt. "
1342
+ description += f"The original high-level instruction being optimized is: '{prompt.formatted_messages}'. "
1325
1343
  description += "The goal is to create an effective prompt that guides a language model to perform this task well."
1326
1344
  return description
1327
1345
 
1328
- def get_reasoning_system_prompt_for_variation(self) -> str:
1346
+ def _get_reasoning_system_prompt_for_variation(self) -> str:
1329
1347
  return f"""You are an expert prompt engineer specializing in creating diverse and effective prompts. Given an initial prompt, your task is to generate a diverse set of alternative prompts.
1330
1348
 
1331
1349
  For each prompt variation, consider:
@@ -1367,13 +1385,18 @@ Consider the following when generating children:
1367
1385
  - You can create a child that is a direct blend, or one that takes a primary structure from one parent and incorporates specific elements from the other, always optimizing for clear instruction towards the desired output style.
1368
1386
  - If generating two children, try to make them distinct from each other and from the parents, perhaps by emphasizing different aspects of the parental combination that could lead to the desired output style.
1369
1387
 
1370
- Return a JSON object with a single key "children_prompts", which is a list of strings. Each string is a child prompt.
1371
- Example for one child: {{"children_prompts": ["child prompt 1 designed for specified style"]}}
1372
- Example for two children: {{"children_prompts": ["child prompt 1 for target style", "child prompt 2 also for target style"]}}
1373
- Generate at least one child, and at most two. All generated prompts must aim for eliciting answers in the style: '{self.output_style_guidance}'.
1388
+ All generated prompts must aim for eliciting answers in the style: '{self.output_style_guidance}'.
1389
+
1390
+ Return a JSON object that is a list of both child prompts. Each child prompt is a list of LLM messages. Example:
1391
+ [
1392
+ [{{"role": "<role>", "content": "<content>"}},{{"role": "<role>", "content": "<content>"}}],
1393
+ [{{"role": "<role>", "content": "<content>"}},{{"role": "<role>", "content": "<content>"}}]
1394
+ ]
1395
+
1396
+
1374
1397
  """
1375
1398
 
1376
- def get_radical_innovation_system_prompt(self) -> str:
1399
+ def _get_radical_innovation_system_prompt(self) -> str:
1377
1400
  return f"""You are an expert prompt engineer and a creative problem solver.
1378
1401
  Given a task description and an existing prompt for that task (which might be underperforming), your goal is to generate a new, significantly improved, and potentially very different prompt.
1379
1402
  Do not just make minor edits. Think about alternative approaches, structures, and phrasings that could lead to better performance.
@@ -1384,66 +1407,59 @@ Return only the new prompt string, with no preamble or explanation.
1384
1407
  def _infer_output_style_from_dataset(
1385
1408
  self,
1386
1409
  dataset: opik.Dataset,
1387
- task_config: TaskConfig,
1410
+ prompt: chat_prompt.ChatPrompt,
1388
1411
  n_examples: int = 5
1389
1412
  ) -> Optional[str]:
1390
1413
  """Analyzes dataset examples to infer the desired output style."""
1391
- logger.info(f"Attempting to infer output style from up to {n_examples} dataset examples...")
1392
- try:
1393
- all_items = dataset.get_items()
1394
- except Exception as e:
1395
- logger.error(f"Failed to get items from dataset '{dataset.name}': {e}")
1396
- return None
1397
-
1398
- if not all_items:
1399
- logger.warning(f"Dataset '{dataset.name}' is empty. Cannot infer output style.")
1400
- return None
1401
-
1402
- # Take the first n_examples
1403
- items_to_process = all_items[:n_examples]
1404
-
1405
- # Need at least a couple of examples for meaningful inference
1406
- if len(items_to_process) < min(n_examples, 2):
1407
- logger.warning(f"Not enough dataset items (found {len(items_to_process)}) to reliably infer output style. Need at least {min(n_examples,2)}.")
1408
- return None
1409
-
1410
- examples_str = ""
1411
- for i, item_obj in enumerate(items_to_process):
1412
- item_content = item_obj.content if hasattr(item_obj, 'content') else item_obj
1413
- if not isinstance(item_content, dict):
1414
- logger.warning(f"Dataset item {i} does not have a .content dictionary or is not a dict itself. Skipping item: {item_obj}")
1415
- continue
1414
+ with reporting.infer_output_style(verbose=self.verbose) as report_infer_output_style:
1415
+ report_infer_output_style.start_style_inference(n_examples)
1416
+
1417
+ try:
1418
+ items_to_process = dataset.get_items(n_examples)
1419
+ except Exception as e:
1420
+ report_infer_output_style.error(f"Failed to get items from dataset '{dataset.name}': {e}")
1421
+ return None
1422
+
1423
+ if not items_to_process:
1424
+ report_infer_output_style.error(f"Dataset '{dataset.name}' is empty. Cannot infer output style.")
1425
+ return None
1416
1426
 
1417
- input_parts = []
1418
- for field in task_config.input_dataset_fields:
1419
- if field in item_content:
1420
- input_parts.append(f"{field.capitalize()}: {item_content[field]}")
1421
- input_str = "\n".join(input_parts)
1422
- output_str = item_content.get(task_config.output_dataset_field, "[NO OUTPUT FIELD FOUND]")
1423
- examples_str += f"Example {i+1}:\nInput(s):\n{input_str}\nOutput: {output_str}\n---\n"
1427
+ # Need at least a couple of examples for meaningful inference
1428
+ if len(items_to_process) < min(n_examples, 2):
1429
+ report_infer_output_style.error(f"Not enough dataset items (found {len(items_to_process)}) to reliably infer output style. Need at least {min(n_examples,2)}.")
1430
+ return None
1424
1431
 
1425
- user_prompt_for_style_inference = f"""Please analyze the following input-output examples from a dataset and provide a concise, actionable description of the REQUIRED output style for the target LLM. This description will be used to guide other LLMs in generating and refining prompts.
1432
+ examples_str = ""
1433
+ for i, item_content in enumerate(items_to_process):
1434
+ filtered_content = {x: y for x, y in item_content.items() if x != "id"}
1435
+ examples_str += f"Example {i+1}:\nDataset Item:\n{filtered_content}\n---\n"
1426
1436
 
1427
- {examples_str}
1437
+ user_prompt_for_style_inference = f"""Please analyze the following examples from a dataset and provide a concise, actionable description of the REQUIRED output style for the target LLM. Before describing the output style, make sure to understand the dataset content and structure as it can include input, output and metadata fields. This description will be used to guide other LLMs in generating and refining prompts.
1428
1438
 
1429
- Based on these examples, what is the desired output style description?
1430
- Remember to focus on aspects like length, tone, structure, content details, and any recurring keywords or phrasing patterns in the outputs.
1431
- The description should be a single string that can be directly used as an instruction for another LLM.
1432
- Return ONLY this descriptive string.
1433
- """
1434
- try:
1435
- inferred_style = self._call_model(
1436
- prompt=user_prompt_for_style_inference,
1437
- system_prompt=self._INFER_STYLE_SYSTEM_PROMPT,
1438
- is_reasoning=True
1439
- )
1440
- inferred_style = inferred_style.strip()
1441
- if inferred_style:
1442
- logger.info(f"Inferred output style: '{inferred_style}'")
1443
- return inferred_style
1444
- else:
1445
- logger.warning("LLM returned empty string for inferred output style.")
1439
+ {examples_str}
1440
+
1441
+ Based on these examples, what is the desired output style description?
1442
+ Remember to focus on aspects like length, tone, structure, content details, and any recurring keywords or phrasing patterns in the outputs.
1443
+ The description should be a single string that can be directly used as an instruction for another LLM.
1444
+ Return ONLY this descriptive string.
1445
+ """
1446
+ #report_infer_output_style.display_style_inference_prompt(user_prompt_for_style_inference)
1447
+
1448
+ try:
1449
+ inferred_style = self._call_model(
1450
+ messages=[
1451
+ {"role": "system", "content": self._INFER_STYLE_SYSTEM_PROMPT},
1452
+ {"role": "user", "content": user_prompt_for_style_inference}
1453
+ ],
1454
+ is_reasoning=True
1455
+ )
1456
+ inferred_style = inferred_style.strip()
1457
+ if inferred_style:
1458
+ report_infer_output_style.success(inferred_style)
1459
+ return inferred_style
1460
+ else:
1461
+ report_infer_output_style.error("LLM returned empty string for inferred output style.")
1462
+ return None
1463
+ except Exception as e:
1464
+ report_infer_output_style.error(f"Error during output style inference: {e}")
1446
1465
  return None
1447
- except Exception as e:
1448
- logger.error(f"Error during output style inference: {e}")
1449
- return None
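Editor's note: the style returned by _infer_output_style_from_dataset only takes effect when the caller has not supplied custom guidance, mirroring the Step 3 logic earlier in this file. A condensed sketch of that decision; DEFAULT_OUTPUT_STYLE_GUIDANCE stands in for the class constant and its text here is a placeholder.

DEFAULT_OUTPUT_STYLE_GUIDANCE = "Provide a clear, appropriately detailed answer."  # placeholder text

def resolve_output_style(user_guidance, infer_output_style, inferred_style):
    # Use the inferred style only when inference is enabled and the user kept the default.
    if infer_output_style and user_guidance in (None, DEFAULT_OUTPUT_STYLE_GUIDANCE):
        if inferred_style:
            return inferred_style
    # Otherwise keep the user's guidance, falling back to the default if it was None.
    return user_guidance or DEFAULT_OUTPUT_STYLE_GUIDANCE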