opik-optimizer 2.1.3__py3-none-any.whl → 2.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. opik_optimizer/__init__.py +0 -2
  2. opik_optimizer/base_optimizer.py +314 -145
  3. opik_optimizer/evolutionary_optimizer/crossover_ops.py +31 -4
  4. opik_optimizer/evolutionary_optimizer/evaluation_ops.py +23 -3
  5. opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +122 -95
  6. opik_optimizer/evolutionary_optimizer/mcp.py +11 -6
  7. opik_optimizer/evolutionary_optimizer/mutation_ops.py +25 -5
  8. opik_optimizer/evolutionary_optimizer/population_ops.py +26 -10
  9. opik_optimizer/evolutionary_optimizer/reporting.py +5 -5
  10. opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +53 -99
  11. opik_optimizer/few_shot_bayesian_optimizer/reporting.py +4 -4
  12. opik_optimizer/gepa_optimizer/gepa_optimizer.py +183 -172
  13. opik_optimizer/gepa_optimizer/reporting.py +164 -22
  14. opik_optimizer/hierarchical_reflective_optimizer/hierarchical_reflective_optimizer.py +90 -167
  15. opik_optimizer/hierarchical_reflective_optimizer/prompts.py +7 -1
  16. opik_optimizer/hierarchical_reflective_optimizer/reporting.py +168 -75
  17. opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +185 -205
  18. opik_optimizer/meta_prompt_optimizer/reporting.py +4 -4
  19. opik_optimizer/mipro_optimizer/__init__.py +2 -2
  20. opik_optimizer/mipro_optimizer/_lm.py +4 -4
  21. opik_optimizer/mipro_optimizer/{_mipro_optimizer_v2.py → mipro_optimizer_v2.py} +1 -7
  22. opik_optimizer/mipro_optimizer/utils.py +1 -0
  23. opik_optimizer/optimizable_agent.py +7 -4
  24. opik_optimizer/optimization_config/chat_prompt.py +7 -10
  25. opik_optimizer/parameter_optimizer/parameter_optimizer.py +188 -40
  26. opik_optimizer/parameter_optimizer/reporting.py +148 -0
  27. opik_optimizer/reporting_utils.py +42 -15
  28. opik_optimizer/utils/core.py +16 -2
  29. opik_optimizer/utils/prompt_segments.py +1 -2
  30. {opik_optimizer-2.1.3.dist-info → opik_optimizer-2.2.0.dist-info}/METADATA +2 -3
  31. {opik_optimizer-2.1.3.dist-info → opik_optimizer-2.2.0.dist-info}/RECORD +34 -35
  32. opik_optimizer/evolutionary_optimizer/llm_support.py +0 -136
  33. opik_optimizer/mipro_optimizer/mipro_optimizer.py +0 -680
  34. {opik_optimizer-2.1.3.dist-info → opik_optimizer-2.2.0.dist-info}/WHEEL +0 -0
  35. {opik_optimizer-2.1.3.dist-info → opik_optimizer-2.2.0.dist-info}/licenses/LICENSE +0 -0
  36. {opik_optimizer-2.1.3.dist-info → opik_optimizer-2.2.0.dist-info}/top_level.txt +0 -0
opik_optimizer/evolutionary_optimizer/evaluation_ops.py

@@ -6,6 +6,7 @@ from .. import task_evaluator
 from ..optimization_config import mappers, chat_prompt
 from ..mcp_utils.mcp_workflow import MCPExecutionConfig
 import opik
+from opik import opik_context
 import copy
 
 if TYPE_CHECKING:  # pragma: no cover - typing only
@@ -15,7 +16,7 @@ if TYPE_CHECKING:  # pragma: no cover - typing only
 class EvaluationOps:
     if TYPE_CHECKING:
         agent_class: type[Any]
-        num_threads: int
+        n_threads: int
 
     def _evaluate_prompt(
         self,
@@ -79,6 +80,16 @@ class EvaluationOps:
 
         if mcp_execution_config is None:
             model_output = agent.invoke(messages)
+
+            # Add tags to trace for optimization tracking
+            if (
+                hasattr(self, "current_optimization_id")
+                and self.current_optimization_id
+            ):
+                opik_context.update_current_trace(
+                    tags=[self.current_optimization_id, "Evaluation"]
+                )
+
             return {mappers.EVALUATED_LLM_TASK_OUTPUT: model_output}
 
         coordinator = mcp_execution_config.coordinator
@@ -119,6 +130,15 @@ class EvaluationOps:
         else:
             final_response = raw_model_output
 
+        # Add tags to trace for optimization tracking
+        if (
+            hasattr(self, "current_optimization_id")
+            and self.current_optimization_id
+        ):
+            opik_context.update_current_trace(
+                tags=[self.current_optimization_id, "Evaluation"]
+            )
+
         return {mappers.EVALUATED_LLM_TASK_OUTPUT: final_response.strip()}
 
         score = task_evaluator.evaluate(
@@ -126,8 +146,8 @@
             dataset_item_ids=dataset_item_ids,
             metric=metric,
             evaluated_task=llm_task,
-            num_threads=self.num_threads,
-            project_name=experiment_config.get("project_name"),
+            num_threads=self.n_threads,
+            project_name=optimizer.project_name,
             n_samples=n_samples if dataset_item_ids is None else None,
             experiment_config=experiment_config,
             optimization_id=optimization_id,
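The new `opik_context` import powers the trace-tagging blocks added in both the plain and MCP execution paths above. A minimal sketch of the pattern, assuming an active Opik trace context and using a stand-in for the `EvaluationOps` instance:

```python
from opik import opik_context

def tag_current_trace(evaluator: object) -> None:
    # `current_optimization_id` may be absent or None when Opik tracking
    # is disabled, hence the guarded lookup mirroring the diff.
    optimization_id = getattr(evaluator, "current_optimization_id", None)
    if optimization_id:
        # Tagging lets the Opik UI filter all evaluation traces
        # belonging to a single optimization run.
        opik_context.update_current_trace(tags=[optimization_id, "Evaluation"])
```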
opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py

@@ -31,7 +31,6 @@ from opik_optimizer.utils.prompt_segments import extract_prompt_segments
 from .mcp import EvolutionaryMCPContext, finalize_mcp_result
 
 from . import reporting
-from .llm_support import LlmSupport
 from .mutation_ops import MutationOps
 from .crossover_ops import CrossoverOps
 from .population_ops import PopulationOps
@@ -48,19 +47,41 @@ creator = cast(Any, _creator)  # type: ignore[assignment]
 
 class EvolutionaryOptimizer(BaseOptimizer):
     """
-    The Evolutionary Optimizer can be used to optimize prompts using a 4 stage genetic algorithm
-    approach:
+    Evolutionary Optimizer that uses genetic algorithms to evolve and improve prompts over generations.
 
-    1. Generate a set of candidate prompts based on variations of the best prompts (exploitation) as
-    well as completely new prompts (exploration)
-    2. Evaluate the candidate prompts
-    3. Select the best prompts
-    4. Repeat until convergence
+    This optimizer uses a 4-stage genetic algorithm approach:
+
+    1. Generate candidate prompts through variations of the best prompts (exploitation) and
+       completely new prompts (exploration)
+    2. Evaluate the candidate prompts on the dataset
+    3. Select the best prompts based on fitness
+    4. Repeat until convergence or max generations reached
 
     This algorithm is best used if you have a first draft prompt and would like to find a better
-    prompt.
+    prompt through iterative evolution. It supports both single-objective and multi-objective
+    optimization (balancing performance and prompt length).
 
     Note: This algorithm is time consuming and can be expensive to run.
+
+    Args:
+        model: LiteLLM model name for optimizer's internal operations (mutations, crossover, etc.)
+        model_parameters: Optional dict of LiteLLM parameters for optimizer's internal LLM calls.
+            Common params: temperature, max_tokens, max_completion_tokens, top_p.
+            See: https://docs.litellm.ai/docs/completion/input
+        population_size: Number of prompts in the population
+        num_generations: Number of generations to run
+        mutation_rate: Mutation rate for genetic operations
+        crossover_rate: Crossover rate for genetic operations
+        tournament_size: Tournament size for selection
+        elitism_size: Number of elite prompts to preserve across generations
+        adaptive_mutation: Whether to use adaptive mutation that adjusts based on population diversity
+        enable_moo: Whether to enable multi-objective optimization (optimizes metric and prompt length)
+        enable_llm_crossover: Whether to enable LLM-based crossover operations
+        output_style_guidance: Optional guidance for output style in generated prompts
+        infer_output_style: Whether to automatically infer output style from the dataset
+        n_threads: Number of threads for parallel evaluation
+        verbose: Controls internal logging/progress bars (0=off, 1=on)
+        seed: Random seed for reproducibility
     """
 
     DEFAULT_POPULATION_SIZE = 30
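Taken together with the new `__init__` signature in the next hunk, the documented parameters translate into a call like the following sketch (values are illustrative, and the package-level import path is assumed):

```python
from opik_optimizer import EvolutionaryOptimizer

optimizer = EvolutionaryOptimizer(
    model="gpt-4o",  # new default in 2.2.0
    model_parameters={"temperature": 0.4, "max_tokens": 1024},
    population_size=20,
    num_generations=5,
    enable_moo=True,  # also optimize for prompt length
    n_threads=8,
    verbose=1,
    seed=42,
)
```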
@@ -98,43 +119,23 @@ class EvolutionaryOptimizer(BaseOptimizer):
 
     def __init__(
         self,
-        model: str,
+        model: str = "gpt-4o",
+        model_parameters: dict[str, Any] | None = None,
         population_size: int = DEFAULT_POPULATION_SIZE,
         num_generations: int = DEFAULT_NUM_GENERATIONS,
         mutation_rate: float = DEFAULT_MUTATION_RATE,
         crossover_rate: float = DEFAULT_CROSSOVER_RATE,
         tournament_size: int = DEFAULT_TOURNAMENT_SIZE,
-        num_threads: int | None = None,
         elitism_size: int = DEFAULT_ELITISM_SIZE,
         adaptive_mutation: bool = DEFAULT_ADAPTIVE_MUTATION,
         enable_moo: bool = DEFAULT_ENABLE_MOO,
         enable_llm_crossover: bool = DEFAULT_ENABLE_LLM_CROSSOVER,
-        seed: int | None = DEFAULT_SEED,
         output_style_guidance: str | None = None,
         infer_output_style: bool = False,
-        verbose: int = 1,
         n_threads: int = DEFAULT_NUM_THREADS,
-        **model_kwargs: Any,
+        verbose: int = 1,
+        seed: int = DEFAULT_SEED,
     ) -> None:
-        """
-        Args:
-            model: The model to use for evaluation
-            population_size: Number of prompts in the population
-            num_generations: Number of generations to run
-            mutation_rate: Mutation rate for genetic operations
-            crossover_rate: Crossover rate for genetic operations
-            tournament_size: Tournament size for selection
-            n_threads: Number of threads for parallel evaluation
-            elitism_size: Number of elitism prompts
-            adaptive_mutation: Whether to use adaptive mutation
-            enable_moo: Whether to enable multi-objective optimization - When enable optimizes for both the supplied metric and the length of the prompt
-            enable_llm_crossover: Whether to enable LLM crossover
-            seed: Random seed for reproducibility
-            output_style_guidance: Output style guidance for prompts
-            infer_output_style: Whether to infer output style
-            verbose: Controls internal logging/progress bars (0=off, 1=on).
-            **model_kwargs: Additional model parameters
-        """
         # Initialize base class first
         if sys.version_info >= (3, 13):
             warnings.warn(
@@ -142,42 +143,27 @@ class EvolutionaryOptimizer(BaseOptimizer):
                 "You may see asyncio teardown warnings. Prefer Python 3.12.",
                 RuntimeWarning,
             )
-        if "project_name" in model_kwargs:
-            warnings.warn(
-                "The 'project_name' parameter in optimizer constructor is deprecated. "
-                "Set project_name in the ChatPrompt instead.",
-                DeprecationWarning,
-                stacklevel=2,
-            )
-            del model_kwargs["project_name"]
 
-        super().__init__(model=model, verbose=verbose, **model_kwargs)
+        super().__init__(
+            model=model, verbose=verbose, seed=seed, model_parameters=model_parameters
+        )
         self.population_size = population_size
         self.num_generations = num_generations
         self.mutation_rate = mutation_rate
         self.crossover_rate = crossover_rate
         self.tournament_size = tournament_size
-        if num_threads is not None:
-            warnings.warn(
-                "The 'num_threads' parameter is deprecated and will be removed in a future version. "
-                "Use 'n_threads' instead.",
-                DeprecationWarning,
-                stacklevel=2,
-            )
-            n_threads = num_threads
-        self.num_threads = n_threads
+        self.n_threads = n_threads
         self.elitism_size = elitism_size
        self.adaptive_mutation = adaptive_mutation
         self.enable_moo = enable_moo
         self.enable_llm_crossover = enable_llm_crossover
-        self.seed = seed if seed is not None else self.DEFAULT_SEED
+        self.seed = seed
         self.output_style_guidance = (
             output_style_guidance
             if output_style_guidance is not None
             else self.DEFAULT_OUTPUT_STYLE_GUIDANCE
         )
         self.infer_output_style = infer_output_style
-        self._current_optimization_id: str | None = None
         self._current_generation = 0
         self._best_fitness_history: list[float] = []
         self._generations_without_improvement = 0
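Because `**model_kwargs` and the `num_threads`/`project_name` shims are gone from the signature, 2.1.x-style constructor calls now fail outright rather than emitting a deprecation warning; a hedged before/after sketch:

```python
from opik_optimizer import EvolutionaryOptimizer  # import path assumed

# 2.1.x style - rejected in 2.2.0 (no **model_kwargs left to absorb these):
# EvolutionaryOptimizer(model="gpt-4o", num_threads=8,
#                       project_name="my-project", temperature=0.4)

# 2.2.0 style: threads via n_threads, LLM params via model_parameters;
# project_name moves to optimize_prompt() (shown further down).
optimizer = EvolutionaryOptimizer(
    model="gpt-4o",
    n_threads=8,
    model_parameters={"temperature": 0.4},
)
```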
@@ -249,8 +235,8 @@ class EvolutionaryOptimizer(BaseOptimizer):
             func = getattr(cls, name)
             setattr(self, name, func.__get__(self, self.__class__))
 
-        # LLM calls
-        bind(LlmSupport, ["_call_model"])
+        # LLM calls - now inherited from BaseOptimizer
+        # bind(LlmSupport, ["_call_model"])  # Removed - using BaseOptimizer._call_model
 
         # Mutations
         bind(
@@ -318,6 +304,7 @@ class EvolutionaryOptimizer(BaseOptimizer):
     ) -> Any:
         individual = creator.Individual(prompt_candidate.get_messages())
         setattr(individual, "tools", copy.deepcopy(prompt_candidate.tools))
+        setattr(individual, "function_map", prompt_candidate.function_map)
         return individual
 
     def _update_individual_with_prompt(
@@ -325,6 +312,7 @@ class EvolutionaryOptimizer(BaseOptimizer):
     ) -> Any:
         individual[:] = prompt_candidate.get_messages()
         setattr(individual, "tools", copy.deepcopy(prompt_candidate.tools))
+        setattr(individual, "function_map", prompt_candidate.function_map)
         return individual
 
     def _get_adaptive_mutation_rate(self) -> float:
@@ -415,13 +403,17 @@ class EvolutionaryOptimizer(BaseOptimizer):
         else:
             elites = tools.selBest(population, self.elitism_size)
 
-        seed_prompt = (
-            chat_prompt.ChatPrompt(
-                messages=max(elites, key=lambda x: x.fitness.values[0])
+        if elites:
+            best_elite = max(elites, key=lambda x: x.fitness.values[0])
+            seed_prompt = chat_prompt.ChatPrompt(
+                messages=best_elite,
+                tools=getattr(best_elite, "tools", best_prompt_so_far.tools),
+                function_map=getattr(
+                    best_elite, "function_map", best_prompt_so_far.function_map
+                ),
             )
-            if elites
-            else best_prompt_so_far
-        )
+        else:
+            seed_prompt = best_prompt_so_far
 
         prompt_variants = self._initialize_population(seed_prompt)
         new_pop = [self._create_individual_from_prompt(p) for p in prompt_variants]
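The `getattr(individual, attr, fallback)` pattern recurs throughout this release: DEAP individuals are plain list subclasses, so `tools` and `function_map` exist on them only if they were attached when the individual was created. A standalone sketch of the idea, with stand-in names for the real classes:

```python
from typing import Any

def rebuild_prompt(individual: Any, fallback: Any, chat_prompt_cls: Any) -> Any:
    """Rebuild a ChatPrompt-like object from a DEAP individual.

    `fallback` is the best prompt known so far; its tools/function_map
    are used when the individual never had them attached. All names
    here are stand-ins, not the package's actual helpers.
    """
    return chat_prompt_cls(
        messages=list(individual),
        tools=getattr(individual, "tools", fallback.tools),
        function_map=getattr(individual, "function_map", fallback.function_map),
    )
```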
@@ -522,6 +514,10 @@ class EvolutionaryOptimizer(BaseOptimizer):
         n_samples: int | None = None,
         auto_continue: bool = False,
         agent_class: type[OptimizableAgent] | None = None,
+        project_name: str = "Optimization",
+        max_trials: int = 10,
+        mcp_config: MCPExecutionConfig | None = None,
+        *args: Any,
         **kwargs: Any,
     ) -> OptimizationResult:
         """
@@ -533,21 +529,18 @@ class EvolutionaryOptimizer(BaseOptimizer):
             n_samples: Optional number of samples to use
             auto_continue: Whether to automatically continue optimization
             agent_class: Optional agent class to use
-            **kwargs: Additional keyword arguments including:
-                mcp_config (MCPExecutionConfig | None): MCP tool calling configuration (default: None)
+            project_name: Opik project name for logging traces (default: "Optimization")
+            mcp_config: MCP tool calling configuration (default: None)
         """
         # Use base class validation and setup methods
-        self.validate_optimization_inputs(prompt, dataset, metric)
-        self.configure_prompt_model(prompt)
-        self.agent_class = self.setup_agent_class(prompt, agent_class)
-
-        # Extract MCP config from kwargs (for optional MCP workflows)
-        mcp_config = kwargs.pop("mcp_config", None)
+        self._validate_optimization_inputs(prompt, dataset, metric)
+        self.agent_class = self._setup_agent_class(prompt, agent_class)
         evaluation_kwargs: dict[str, Any] = {}
         if mcp_config is not None:
             evaluation_kwargs["mcp_config"] = mcp_config
 
-        self.project_name = self.agent_class.project_name
+        # Set project name from parameter
+        self.project_name = project_name
 
         # Step 0. Start Opik optimization run
         opik_optimization_run: optimization.Optimization | None = None
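With `project_name`, `max_trials`, and `mcp_config` promoted to explicit parameters, a 2.2.0 call looks like the following sketch, continuing from the constructor example above (`prompt`, `dataset`, and `metric` are assumed to be a ChatPrompt, an Opik dataset, and a metric callable):

```python
result = optimizer.optimize_prompt(
    prompt=prompt,
    dataset=dataset,
    metric=metric,
    n_samples=50,
    project_name="my-opik-project",  # was read from the agent class in 2.1.x
    max_trials=25,                   # new evaluation budget (default 10)
)
print(result)
```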
@@ -557,14 +550,14 @@ class EvolutionaryOptimizer(BaseOptimizer):
                 objective_name=metric.__name__,
                 metadata={"optimizer": self.__class__.__name__},
             )
-            self._current_optimization_id = opik_optimization_run.id
+            self.current_optimization_id = opik_optimization_run.id
         except Exception as e:
             logger.warning(f"Opik server error: {e}. Continuing without Opik tracking.")
-            self._current_optimization_id = None
+            self.current_optimization_id = None
 
         reporting.display_header(
             algorithm=self.__class__.__name__,
-            optimization_id=self._current_optimization_id,
+            optimization_id=self.current_optimization_id,
             dataset_id=dataset.id,
             verbose=self.verbose,
         )
@@ -583,7 +576,8 @@ class EvolutionaryOptimizer(BaseOptimizer):
         )
 
         # Step 1. Step variables and define fitness function
-        self.reset_counters()  # Reset counters for run
+        self._reset_counters()  # Reset counters for run
+        trials_used = [0]  # Use list for closure mutability
         self._history: list[OptimizationRound] = []
         self._current_generation = 0
         self._best_fitness_history = []
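The one-element list is the classic workaround for mutating a counter from nested closures without `nonlocal` (the fitness functions below are defined in two alternative branches, so a shared mutable cell is the simplest option). A minimal self-contained sketch:

```python
def make_budgeted_evaluator(max_trials: int):
    trials_used = [0]  # list cell, so the nested closure can mutate it

    def evaluate(candidate: str) -> float:
        if trials_used[0] >= max_trials:
            return float("-inf")  # budget exhausted: worst fitness
        trials_used[0] += 1
        return float(len(candidate))  # stand-in for a real metric

    return evaluate

evaluate = make_budgeted_evaluator(max_trials=2)
print(evaluate("a"), evaluate("bb"), evaluate("ccc"))  # 1.0 2.0 -inf
```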
@@ -595,7 +589,16 @@ class EvolutionaryOptimizer(BaseOptimizer):
 
             def _deap_evaluate_individual_fitness(
                 messages: list[dict[str, str]],
-            ) -> tuple[float, float]:
+            ) -> tuple[float, ...]:
+                # Check if we've hit the limit
+                if trials_used[0] >= max_trials:
+                    logger.debug(
+                        f"Skipping evaluation - max_trials ({max_trials}) reached"
+                    )
+                    return (-float("inf"), float("inf"))  # Worst possible fitness
+
+                trials_used[0] += 1
+
                 primary_fitness_score: float = self._evaluate_prompt(
                     prompt,
                     messages,  # type: ignore
@@ -603,7 +606,7 @@ class EvolutionaryOptimizer(BaseOptimizer):
                     metric=metric,
                     n_samples=n_samples,
                     experiment_config=(experiment_config or {}).copy(),
-                    optimization_id=self._current_optimization_id,
+                    optimization_id=self.current_optimization_id,
                     verbose=0,
                     **evaluation_kwargs,
                 )
@@ -614,7 +617,16 @@ class EvolutionaryOptimizer(BaseOptimizer):
             # Single-objective
             def _deap_evaluate_individual_fitness(
                 messages: list[dict[str, str]],
-            ) -> tuple[float, float]:
+            ) -> tuple[float, ...]:
+                # Check if we've hit the limit
+                if trials_used[0] >= max_trials:
+                    logger.debug(
+                        f"Skipping evaluation - max_trials ({max_trials}) reached"
+                    )
+                    return (-float("inf"),)  # Worst possible fitness
+
+                trials_used[0] += 1
+
                 fitness_score: float = self._evaluate_prompt(
                     prompt,
                     messages,  # type: ignore
@@ -622,11 +634,11 @@ class EvolutionaryOptimizer(BaseOptimizer):
                     metric=metric,
                     n_samples=n_samples,
                     experiment_config=(experiment_config or {}).copy(),
-                    optimization_id=self._current_optimization_id,
+                    optimization_id=self.current_optimization_id,
                     verbose=0,
                     **evaluation_kwargs,
                 )
-                return (fitness_score, 0.0)
+                return (fitness_score,)
 
         self.toolbox.register("evaluate", _deap_evaluate_individual_fitness)
@@ -644,6 +656,7 @@ class EvolutionaryOptimizer(BaseOptimizer):
             else float(len(json.dumps(prompt.get_messages())))
         )
 
+        trials_used[0] = 0
         best_primary_score_overall = initial_primary_score
         best_prompt_overall = prompt
         report_baseline_performance.set_score(initial_primary_score)
@@ -715,14 +728,22 @@ class EvolutionaryOptimizer(BaseOptimizer):
                 )
                 best_primary_score_overall = current_best_for_primary.fitness.values[0]
                 best_prompt_overall = chat_prompt.ChatPrompt(
-                    messages=current_best_for_primary
+                    messages=current_best_for_primary,
+                    tools=getattr(current_best_for_primary, "tools", prompt.tools),
+                    function_map=getattr(
+                        current_best_for_primary, "function_map", prompt.function_map
+                    ),
                 )
             else:
                 # Single-objective
                 current_best_on_front = hof[0]
                 best_primary_score_overall = current_best_on_front.fitness.values[0]
                 best_prompt_overall = chat_prompt.ChatPrompt(
-                    messages=current_best_on_front
+                    messages=current_best_on_front,
+                    tools=getattr(current_best_on_front, "tools", prompt.tools),
+                    function_map=getattr(
+                        current_best_on_front, "function_map", prompt.function_map
+                    ),
                 )
 
         if self.enable_moo:
@@ -756,6 +777,13 @@ class EvolutionaryOptimizer(BaseOptimizer):
             verbose=self.verbose
         ) as report_evolutionary_algo:
             for generation_idx in range(1, self.num_generations + 1):
+                # Check if we've exhausted our evaluation budget
+                if trials_used[0] >= max_trials:
+                    logger.info(
+                        f"Stopping optimization: max_trials ({max_trials}) reached after {generation_idx - 1} generations"
+                    )
+                    break
+
                 report_evolutionary_algo.start_gen(generation_idx, self.num_generations)
 
                 curr_best_score = self._population_best_score(deap_population)
@@ -854,7 +882,11 @@ class EvolutionaryOptimizer(BaseOptimizer):
                 final_results_log += f"  Solution {i + 1}: Primary Score={sol.fitness.values[0]:.4f}, Length={sol.fitness.values[1]:.0f}, Prompt='{str(sol)[:100]}...'\n"
             best_overall_solution = sorted_hof[0]
             final_best_prompt = chat_prompt.ChatPrompt(
-                messages=best_overall_solution
+                messages=best_overall_solution,
+                tools=getattr(best_overall_solution, "tools", prompt.tools),
+                function_map=getattr(
+                    best_overall_solution, "function_map", prompt.function_map
+                ),
             )
             final_primary_score = best_overall_solution.fitness.values[0]
             final_length = best_overall_solution.fitness.values[1]
@@ -922,11 +954,12 @@ class EvolutionaryOptimizer(BaseOptimizer):
         )
 
         logger.info(f"Total LLM calls during optimization: {self.llm_call_counter}")
+        logger.info(f"Total prompt evaluations: {trials_used[0]}")
         if opik_optimization_run:
             try:
                 opik_optimization_run.update(status="completed")
                 logger.info(
-                    f"Opik Optimization run {self._current_optimization_id} status updated to completed."
+                    f"Opik Optimization run {self.current_optimization_id} status updated to completed."
                 )
             except Exception as e:
                 logger.warning(f"Failed to update Opik Optimization run status: {e}")
@@ -952,13 +985,14 @@ class EvolutionaryOptimizer(BaseOptimizer):
                 "seed": self.seed,
                 "prompt_type": "single_string_ga",
                 "initial_score_for_display": initial_score_for_display,
-                "temperature": self.model_kwargs.get("temperature"),
+                "temperature": self.model_parameters.get("temperature"),
                 "stopped_early": stopped_early_flag,
                 "rounds": self.get_history(),
                 "user_output_style_guidance": self.output_style_guidance,
                 "infer_output_style_requested": self.infer_output_style,
                 "final_effective_output_style_guidance": effective_output_style_guidance,
                 "infer_output_style": self.infer_output_style,
+                "trials_used": trials_used[0],
             }
         )
@@ -974,14 +1008,7 @@ class EvolutionaryOptimizer(BaseOptimizer):
         final_tools = getattr(final_best_prompt, "tools", None)
         if final_tools:
             final_details["final_tools"] = final_tools
-            tool_prompts = {
-                (tool.get("function", {}).get("name") or f"tool_{idx}"): tool.get(
-                    "function", {}
-                ).get("description")
-                for idx, tool in enumerate(final_tools)
-            }
-        else:
-            tool_prompts = None
+        tool_prompts = self._extract_tool_prompts(final_tools)
 
         return OptimizationResult(
             optimizer=self.__class__.__name__,
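The inline dict comprehension is replaced by a shared `BaseOptimizer._extract_tool_prompts` helper whose body is not shown in this diff; judging from the code removed here, it plausibly behaves like this hypothetical reconstruction:

```python
from typing import Any

def _extract_tool_prompts(
    tools: list[dict[str, Any]] | None,
) -> dict[str, Any] | None:
    """Map each tool's function name to its description.

    Hypothetical reconstruction based on the code this release removed;
    the real helper lives on BaseOptimizer and is not part of this diff.
    """
    if not tools:
        return None
    return {
        (tool.get("function", {}).get("name") or f"tool_{idx}"): tool.get(
            "function", {}
        ).get("description")
        for idx, tool in enumerate(tools)
    }
```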
@@ -995,7 +1022,7 @@ class EvolutionaryOptimizer(BaseOptimizer):
             llm_calls=self.llm_call_counter,
             tool_calls=self.tool_call_counter,
             dataset_id=dataset.id,
-            optimization_id=self._current_optimization_id,
+            optimization_id=self.current_optimization_id,
             tool_prompts=tool_prompts,
         )
@@ -1090,7 +1117,7 @@ class EvolutionaryOptimizer(BaseOptimizer):
             self._mcp_context = previous_context
             self.enable_llm_crossover = previous_crossover
 
-        finalize_mcp_result(result, context, panel_style)
+        finalize_mcp_result(result, context, panel_style, optimizer=self)
         return result
 
     # Evaluation is provided by EvaluationOps
@@ -1102,7 +1129,7 @@ class EvolutionaryOptimizer(BaseOptimizer):
     def _get_reasoning_system_prompt_for_variation(self) -> str:
         return evo_prompts.variation_system_prompt(self.output_style_guidance)
 
-    def get_llm_crossover_system_prompt(self) -> str:
+    def _get_llm_crossover_system_prompt(self) -> str:
         return evo_prompts.llm_crossover_system_prompt(self.output_style_guidance)
 
     def _get_radical_innovation_system_prompt(self) -> str:
opik_optimizer/evolutionary_optimizer/mcp.py

@@ -168,16 +168,21 @@ def finalize_mcp_result(
     result: Any,
     context: EvolutionaryMCPContext,
     panel_style: str,
+    optimizer: Any = None,
 ) -> None:
     final_tools = (
         result.details.get("final_tools") if isinstance(result.details, dict) else None
     )
-    tool_prompts = {
-        (tool.get("function", {}).get("name") or tool.get("name")): tool.get(
-            "function", {}
-        ).get("description")
-        for tool in (final_tools or [])
-    }
+    # Use optimizer's centralized method if available, otherwise inline
+    if optimizer and hasattr(optimizer, "_extract_tool_prompts"):
+        tool_prompts = optimizer._extract_tool_prompts(final_tools) or {}
+    else:
+        tool_prompts = {
+            (tool.get("function", {}).get("name") or tool.get("name")): tool.get(
+                "function", {}
+            ).get("description")
+            for tool in (final_tools or [])
+        }
     if tool_prompts.get(context.tool_name):
         reporting.display_tool_description(
             tool_prompts[context.tool_name],
opik_optimizer/evolutionary_optimizer/mutation_ops.py

@@ -30,7 +30,11 @@ class MutationOps:
         self, individual: Any, initial_prompt: chat_prompt.ChatPrompt
     ) -> Any:
         """Enhanced mutation operation with multiple strategies."""
-        prompt = chat_prompt.ChatPrompt(messages=individual)
+        prompt = chat_prompt.ChatPrompt(
+            messages=individual,
+            tools=initial_prompt.tools,
+            function_map=initial_prompt.function_map,
+        )
 
         mcp_context = getattr(self, "_mcp_context", None)
         if mcp_context is not None:
@@ -130,7 +134,11 @@ class MutationOps:
                     f"Error parsing semantic mutation response as JSON. "
                     f"Response: {response!r}\nOriginal error: {parse_exc}"
                 ) from parse_exc
-            return chat_prompt.ChatPrompt(messages=messages)
+            return chat_prompt.ChatPrompt(
+                messages=messages,
+                tools=prompt.tools,
+                function_map=prompt.function_map,
+            )
         except Exception as e:
             reporting.display_error(
                 f"      Error in semantic mutation, this is usually a parsing error: {e}",
@@ -187,7 +195,11 @@ class MutationOps:
             else:
                 mutated_messages.append({"role": role, "content": content})
 
-        return chat_prompt.ChatPrompt(messages=mutated_messages)
+        return chat_prompt.ChatPrompt(
+            messages=mutated_messages,
+            tools=prompt.tools,
+            function_map=prompt.function_map,
+        )
 
     def _word_level_mutation_prompt(
         self, prompt: chat_prompt.ChatPrompt
@@ -200,7 +212,11 @@ class MutationOps:
                     "content": self._word_level_mutation(message["content"]),
                 }
             )
-        return chat_prompt.ChatPrompt(messages=mutated_messages)
+        return chat_prompt.ChatPrompt(
+            messages=mutated_messages,
+            tools=prompt.tools,
+            function_map=prompt.function_map,
+        )
 
     def _word_level_mutation(self, msg_content: str) -> str:
         """Perform word-level mutation."""
@@ -298,7 +314,11 @@ class MutationOps:
                     f"Failed to parse LLM output in radical innovation mutation for prompt '{json.dumps(prompt.get_messages())[:50]}...'. Output: {new_prompt_str[:200]}. Error: {parse_exc}. Returning original."
                 )
                 return prompt
-            return chat_prompt.ChatPrompt(messages=new_messages)
+            return chat_prompt.ChatPrompt(
+                messages=new_messages,
+                tools=prompt.tools,
+                function_map=prompt.function_map,
+            )
         except Exception as e:
             logger.warning(
                 f"Radical innovation mutation failed for prompt '{json.dumps(prompt.get_messages())[:50]}...': {e}. Returning original."
opik_optimizer/evolutionary_optimizer/population_ops.py

@@ -88,13 +88,21 @@ class PopulationOps:
                     p.get("role") is not None for p in fresh_prompts
                 ):
                     population.append(
-                        chat_prompt.ChatPrompt(messages=fresh_prompts)
+                        chat_prompt.ChatPrompt(
+                            messages=fresh_prompts,
+                            tools=prompt.tools,
+                            function_map=prompt.function_map,
+                        )
                     )
                     init_pop_report.success_fresh_prompts(1)
                 elif all(isinstance(p, list) for p in fresh_prompts):
                     population.extend(
                         [
-                            chat_prompt.ChatPrompt(messages=p)
+                            chat_prompt.ChatPrompt(
+                                messages=p,
+                                tools=prompt.tools,
+                                function_map=prompt.function_map,
+                            )
                             for p in fresh_prompts[:num_fresh_starts]
                         ]
                     )
@@ -157,7 +165,11 @@ class PopulationOps:
                 )
                 population.extend(
                     [
-                        chat_prompt.ChatPrompt(messages=p)
+                        chat_prompt.ChatPrompt(
+                            messages=p,
+                            tools=prompt.tools,
+                            function_map=prompt.function_map,
+                        )
                         for p in generated_prompts_variations[
                             :num_variations_on_initial
                         ]
@@ -210,16 +222,20 @@ class PopulationOps:
         else:
             elites = tools.selBest(population, self.elitism_size)
 
-        seed_prompt = (
-            chat_prompt.ChatPrompt(
-                messages=max(elites, key=lambda x: x.fitness.values[0])
+        if elites:
+            best_elite = max(elites, key=lambda x: x.fitness.values[0])
+            seed_prompt = chat_prompt.ChatPrompt(
+                messages=best_elite,
+                tools=getattr(best_elite, "tools", best_prompt_so_far.tools),
+                function_map=getattr(
+                    best_elite, "function_map", best_prompt_so_far.function_map
+                ),
             )
-            if elites
-            else best_prompt_so_far
-        )
+        else:
+            seed_prompt = best_prompt_so_far
 
         prompt_variants = self._initialize_population(seed_prompt)
-        new_pop = [creator.Individual(p.get_messages()) for p in prompt_variants]
+        new_pop = [self._create_individual_from_prompt(p) for p in prompt_variants]  # type: ignore[attr-defined]
 
         for ind, fit in zip(new_pop, map(self.toolbox.evaluate, new_pop)):
             ind.fitness.values = fit