opik-optimizer 2.1.2__py3-none-any.whl → 2.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opik_optimizer/__init__.py +2 -2
- opik_optimizer/base_optimizer.py +314 -145
- opik_optimizer/evolutionary_optimizer/crossover_ops.py +31 -4
- opik_optimizer/evolutionary_optimizer/evaluation_ops.py +23 -3
- opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +122 -95
- opik_optimizer/evolutionary_optimizer/mcp.py +11 -6
- opik_optimizer/evolutionary_optimizer/mutation_ops.py +25 -5
- opik_optimizer/evolutionary_optimizer/population_ops.py +26 -10
- opik_optimizer/evolutionary_optimizer/reporting.py +5 -5
- opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +53 -99
- opik_optimizer/few_shot_bayesian_optimizer/reporting.py +4 -4
- opik_optimizer/gepa_optimizer/gepa_optimizer.py +183 -172
- opik_optimizer/gepa_optimizer/reporting.py +164 -22
- opik_optimizer/hierarchical_reflective_optimizer/hierarchical_reflective_optimizer.py +221 -245
- opik_optimizer/hierarchical_reflective_optimizer/hierarchical_root_cause_analyzer.py +38 -14
- opik_optimizer/hierarchical_reflective_optimizer/prompts.py +7 -1
- opik_optimizer/hierarchical_reflective_optimizer/reporting.py +287 -132
- opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +185 -205
- opik_optimizer/meta_prompt_optimizer/reporting.py +4 -4
- opik_optimizer/mipro_optimizer/__init__.py +2 -2
- opik_optimizer/mipro_optimizer/_lm.py +4 -4
- opik_optimizer/mipro_optimizer/{_mipro_optimizer_v2.py → mipro_optimizer_v2.py} +1 -7
- opik_optimizer/mipro_optimizer/utils.py +1 -0
- opik_optimizer/multi_metric_objective.py +33 -0
- opik_optimizer/optimizable_agent.py +7 -4
- opik_optimizer/optimization_config/chat_prompt.py +7 -10
- opik_optimizer/parameter_optimizer/parameter_optimizer.py +188 -40
- opik_optimizer/parameter_optimizer/reporting.py +148 -0
- opik_optimizer/reporting_utils.py +42 -15
- opik_optimizer/task_evaluator.py +26 -9
- opik_optimizer/utils/core.py +16 -2
- opik_optimizer/utils/prompt_segments.py +1 -2
- {opik_optimizer-2.1.2.dist-info → opik_optimizer-2.2.0.dist-info}/METADATA +2 -3
- {opik_optimizer-2.1.2.dist-info → opik_optimizer-2.2.0.dist-info}/RECORD +37 -37
- opik_optimizer/evolutionary_optimizer/llm_support.py +0 -136
- opik_optimizer/mipro_optimizer/mipro_optimizer.py +0 -680
- {opik_optimizer-2.1.2.dist-info → opik_optimizer-2.2.0.dist-info}/WHEEL +0 -0
- {opik_optimizer-2.1.2.dist-info → opik_optimizer-2.2.0.dist-info}/licenses/LICENSE +0 -0
- {opik_optimizer-2.1.2.dist-info → opik_optimizer-2.2.0.dist-info}/top_level.txt +0 -0
opik_optimizer/evolutionary_optimizer/evaluation_ops.py

```diff
@@ -6,6 +6,7 @@ from .. import task_evaluator
 from ..optimization_config import mappers, chat_prompt
 from ..mcp_utils.mcp_workflow import MCPExecutionConfig
 import opik
+from opik import opik_context
 import copy
 
 if TYPE_CHECKING:  # pragma: no cover - typing only
@@ -15,7 +16,7 @@ if TYPE_CHECKING:  # pragma: no cover - typing only
 class EvaluationOps:
     if TYPE_CHECKING:
         agent_class: type[Any]
-
+        n_threads: int
 
     def _evaluate_prompt(
         self,
@@ -79,6 +80,16 @@ class EvaluationOps:
 
             if mcp_execution_config is None:
                 model_output = agent.invoke(messages)
+
+                # Add tags to trace for optimization tracking
+                if (
+                    hasattr(self, "current_optimization_id")
+                    and self.current_optimization_id
+                ):
+                    opik_context.update_current_trace(
+                        tags=[self.current_optimization_id, "Evaluation"]
+                    )
+
                 return {mappers.EVALUATED_LLM_TASK_OUTPUT: model_output}
 
             coordinator = mcp_execution_config.coordinator
@@ -119,6 +130,15 @@ class EvaluationOps:
            else:
                final_response = raw_model_output
 
+            # Add tags to trace for optimization tracking
+            if (
+                hasattr(self, "current_optimization_id")
+                and self.current_optimization_id
+            ):
+                opik_context.update_current_trace(
+                    tags=[self.current_optimization_id, "Evaluation"]
+                )
+
            return {mappers.EVALUATED_LLM_TASK_OUTPUT: final_response.strip()}
 
        score = task_evaluator.evaluate(
@@ -126,8 +146,8 @@ class EvaluationOps:
            dataset_item_ids=dataset_item_ids,
            metric=metric,
            evaluated_task=llm_task,
-            num_threads=self.
-            project_name=
+            num_threads=self.n_threads,
+            project_name=optimizer.project_name,
            n_samples=n_samples if dataset_item_ids is None else None,
            experiment_config=experiment_config,
            optimization_id=optimization_id,
```
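Both hunks above add the same guard-then-tag step around the model call. Pulled out of context, the pattern looks like this; the class name is illustrative, and only the guard and the `opik_context.update_current_trace` call come from the diff:

```python
# Sketch of the 2.2.0 tagging pattern (illustrative class name).
from opik import opik_context


class TaggedEvaluation:
    current_optimization_id: str | None = None

    def tag_current_trace(self) -> None:
        # Tag only when an optimization run id was obtained from the Opik
        # server; hasattr guards mixin-style classes that may not define it.
        if (
            hasattr(self, "current_optimization_id")
            and self.current_optimization_id
        ):
            opik_context.update_current_trace(
                tags=[self.current_optimization_id, "Evaluation"]
            )
```

Tagging every evaluation trace with the run id presumably lets traces be filtered per optimization run on the Opik side.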
opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py

```diff
@@ -31,7 +31,6 @@ from opik_optimizer.utils.prompt_segments import extract_prompt_segments
 from .mcp import EvolutionaryMCPContext, finalize_mcp_result
 
 from . import reporting
-from .llm_support import LlmSupport
 from .mutation_ops import MutationOps
 from .crossover_ops import CrossoverOps
 from .population_ops import PopulationOps
@@ -48,19 +47,41 @@ creator = cast(Any, _creator)  # type: ignore[assignment]
 
 class EvolutionaryOptimizer(BaseOptimizer):
     """
-
-    approach:
+    Evolutionary Optimizer that uses genetic algorithms to evolve and improve prompts over generations.
 
-
-
-
-
-
+    This optimizer uses a 4-stage genetic algorithm approach:
+
+    1. Generate candidate prompts through variations of the best prompts (exploitation) and
+       completely new prompts (exploration)
+    2. Evaluate the candidate prompts on the dataset
+    3. Select the best prompts based on fitness
+    4. Repeat until convergence or max generations reached
 
     This algorithm is best used if you have a first draft prompt and would like to find a better
-    prompt.
+    prompt through iterative evolution. It supports both single-objective and multi-objective
+    optimization (balancing performance and prompt length).
 
     Note: This algorithm is time consuming and can be expensive to run.
+
+    Args:
+        model: LiteLLM model name for optimizer's internal operations (mutations, crossover, etc.)
+        model_parameters: Optional dict of LiteLLM parameters for optimizer's internal LLM calls.
+            Common params: temperature, max_tokens, max_completion_tokens, top_p.
+            See: https://docs.litellm.ai/docs/completion/input
+        population_size: Number of prompts in the population
+        num_generations: Number of generations to run
+        mutation_rate: Mutation rate for genetic operations
+        crossover_rate: Crossover rate for genetic operations
+        tournament_size: Tournament size for selection
+        elitism_size: Number of elite prompts to preserve across generations
+        adaptive_mutation: Whether to use adaptive mutation that adjusts based on population diversity
+        enable_moo: Whether to enable multi-objective optimization (optimizes metric and prompt length)
+        enable_llm_crossover: Whether to enable LLM-based crossover operations
+        output_style_guidance: Optional guidance for output style in generated prompts
+        infer_output_style: Whether to automatically infer output style from the dataset
+        n_threads: Number of threads for parallel evaluation
+        verbose: Controls internal logging/progress bars (0=off, 1=on)
+        seed: Random seed for reproducibility
     """
 
     DEFAULT_POPULATION_SIZE = 30
@@ -98,43 +119,23 @@ class EvolutionaryOptimizer(BaseOptimizer):
 
     def __init__(
         self,
-        model: str,
+        model: str = "gpt-4o",
+        model_parameters: dict[str, Any] | None = None,
         population_size: int = DEFAULT_POPULATION_SIZE,
         num_generations: int = DEFAULT_NUM_GENERATIONS,
         mutation_rate: float = DEFAULT_MUTATION_RATE,
         crossover_rate: float = DEFAULT_CROSSOVER_RATE,
         tournament_size: int = DEFAULT_TOURNAMENT_SIZE,
-        num_threads: int | None = None,
         elitism_size: int = DEFAULT_ELITISM_SIZE,
         adaptive_mutation: bool = DEFAULT_ADAPTIVE_MUTATION,
         enable_moo: bool = DEFAULT_ENABLE_MOO,
         enable_llm_crossover: bool = DEFAULT_ENABLE_LLM_CROSSOVER,
-        seed: int | None = DEFAULT_SEED,
         output_style_guidance: str | None = None,
         infer_output_style: bool = False,
-        verbose: int = 1,
         n_threads: int = DEFAULT_NUM_THREADS,
-
+        verbose: int = 1,
+        seed: int = DEFAULT_SEED,
     ) -> None:
-        """
-        Args:
-            model: The model to use for evaluation
-            population_size: Number of prompts in the population
-            num_generations: Number of generations to run
-            mutation_rate: Mutation rate for genetic operations
-            crossover_rate: Crossover rate for genetic operations
-            tournament_size: Tournament size for selection
-            n_threads: Number of threads for parallel evaluation
-            elitism_size: Number of elitism prompts
-            adaptive_mutation: Whether to use adaptive mutation
-            enable_moo: Whether to enable multi-objective optimization - When enable optimizes for both the supplied metric and the length of the prompt
-            enable_llm_crossover: Whether to enable LLM crossover
-            seed: Random seed for reproducibility
-            output_style_guidance: Output style guidance for prompts
-            infer_output_style: Whether to infer output style
-            verbose: Controls internal logging/progress bars (0=off, 1=on).
-            **model_kwargs: Additional model parameters
-        """
         # Initialize base class first
         if sys.version_info >= (3, 13):
             warnings.warn(
```
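The docstring and signature hunks above define the new construction surface for 2.2.0: `model` gains a default, and LiteLLM parameters move from `**model_kwargs` into an explicit `model_parameters` dict. A hedged sketch of what instantiation might look like under that signature; the values are illustrative, and the import path follows the module layout shown in the file list:

```python
# Illustrative construction under the new 2.2.0 signature.
from opik_optimizer.evolutionary_optimizer.evolutionary_optimizer import (
    EvolutionaryOptimizer,
)

optimizer = EvolutionaryOptimizer(
    model="gpt-4o",                       # new default per the diff
    model_parameters={"temperature": 0.4, "max_tokens": 1024},
    population_size=30,
    num_generations=10,
    enable_moo=True,   # also track prompt length as a second objective
    n_threads=8,
    verbose=1,
    seed=42,
)
```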
opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py (continued)

```diff
@@ -142,42 +143,27 @@ class EvolutionaryOptimizer(BaseOptimizer):
                 "You may see asyncio teardown warnings. Prefer Python 3.12.",
                 RuntimeWarning,
             )
-        if "project_name" in model_kwargs:
-            warnings.warn(
-                "The 'project_name' parameter in optimizer constructor is deprecated. "
-                "Set project_name in the ChatPrompt instead.",
-                DeprecationWarning,
-                stacklevel=2,
-            )
-            del model_kwargs["project_name"]
 
-        super().__init__(
+        super().__init__(
+            model=model, verbose=verbose, seed=seed, model_parameters=model_parameters
+        )
         self.population_size = population_size
         self.num_generations = num_generations
         self.mutation_rate = mutation_rate
         self.crossover_rate = crossover_rate
         self.tournament_size = tournament_size
-
-        warnings.warn(
-            "The 'num_threads' parameter is deprecated and will be removed in a future version. "
-            "Use 'n_threads' instead.",
-            DeprecationWarning,
-            stacklevel=2,
-        )
-        n_threads = num_threads
-        self.num_threads = n_threads
+        self.n_threads = n_threads
         self.elitism_size = elitism_size
         self.adaptive_mutation = adaptive_mutation
         self.enable_moo = enable_moo
         self.enable_llm_crossover = enable_llm_crossover
-        self.seed = seed
+        self.seed = seed
         self.output_style_guidance = (
             output_style_guidance
             if output_style_guidance is not None
             else self.DEFAULT_OUTPUT_STYLE_GUIDANCE
         )
         self.infer_output_style = infer_output_style
-        self._current_optimization_id: str | None = None
         self._current_generation = 0
         self._best_fitness_history: list[float] = []
         self._generations_without_improvement = 0
@@ -249,8 +235,8 @@ class EvolutionaryOptimizer(BaseOptimizer):
             func = getattr(cls, name)
             setattr(self, name, func.__get__(self, self.__class__))
 
-        # LLM calls
-        bind(LlmSupport, ["_call_model"])
+        # LLM calls - now inherited from BaseOptimizer
+        # bind(LlmSupport, ["_call_model"])  # Removed - using BaseOptimizer._call_model
 
         # Mutations
         bind(
@@ -318,6 +304,7 @@ class EvolutionaryOptimizer(BaseOptimizer):
     ) -> Any:
         individual = creator.Individual(prompt_candidate.get_messages())
         setattr(individual, "tools", copy.deepcopy(prompt_candidate.tools))
+        setattr(individual, "function_map", prompt_candidate.function_map)
         return individual
 
     def _update_individual_with_prompt(
@@ -325,6 +312,7 @@ class EvolutionaryOptimizer(BaseOptimizer):
     ) -> Any:
         individual[:] = prompt_candidate.get_messages()
         setattr(individual, "tools", copy.deepcopy(prompt_candidate.tools))
+        setattr(individual, "function_map", prompt_candidate.function_map)
         return individual
 
     def _get_adaptive_mutation_rate(self) -> float:
@@ -415,13 +403,17 @@ class EvolutionaryOptimizer(BaseOptimizer):
         else:
             elites = tools.selBest(population, self.elitism_size)
 
-
-
-
+        if elites:
+            best_elite = max(elites, key=lambda x: x.fitness.values[0])
+            seed_prompt = chat_prompt.ChatPrompt(
+                messages=best_elite,
+                tools=getattr(best_elite, "tools", best_prompt_so_far.tools),
+                function_map=getattr(
+                    best_elite, "function_map", best_prompt_so_far.function_map
+                ),
             )
-
-
-)
+        else:
+            seed_prompt = best_prompt_so_far
 
         prompt_variants = self._initialize_population(seed_prompt)
         new_pop = [self._create_individual_from_prompt(p) for p in prompt_variants]
@@ -522,6 +514,10 @@ class EvolutionaryOptimizer(BaseOptimizer):
         n_samples: int | None = None,
         auto_continue: bool = False,
         agent_class: type[OptimizableAgent] | None = None,
+        project_name: str = "Optimization",
+        max_trials: int = 10,
+        mcp_config: MCPExecutionConfig | None = None,
+        *args: Any,
         **kwargs: Any,
     ) -> OptimizationResult:
         """
```
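`optimize_prompt` gains explicit `project_name`, `max_trials`, and `mcp_config` keyword parameters in the hunk above, replacing the old kwargs-based plumbing. A hedged sketch of a call site under the new signature; `prompt`, `dataset`, and `metric` stand for user-supplied objects, and only the keyword names and defaults come from the diff:

```python
# Hypothetical optimize_prompt call using the 2.2.0 parameters.
result = optimizer.optimize_prompt(
    prompt=prompt,    # a chat_prompt.ChatPrompt
    dataset=dataset,  # an opik Dataset
    metric=metric,    # scoring callable; its __name__ names the objective
    n_samples=50,
    project_name="prompt-optimization",  # Opik project for logged traces
    max_trials=100,                      # budget on prompt evaluations (default 10)
    mcp_config=None,                     # explicit now, no longer popped from kwargs
)
```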
opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py (continued)

```diff
@@ -533,21 +529,18 @@ class EvolutionaryOptimizer(BaseOptimizer):
             n_samples: Optional number of samples to use
             auto_continue: Whether to automatically continue optimization
             agent_class: Optional agent class to use
-
-
+            project_name: Opik project name for logging traces (default: "Optimization")
+            mcp_config: MCP tool calling configuration (default: None)
         """
         # Use base class validation and setup methods
-        self.
-        self.
-        self.agent_class = self.setup_agent_class(prompt, agent_class)
-
-        # Extract MCP config from kwargs (for optional MCP workflows)
-        mcp_config = kwargs.pop("mcp_config", None)
+        self._validate_optimization_inputs(prompt, dataset, metric)
+        self.agent_class = self._setup_agent_class(prompt, agent_class)
         evaluation_kwargs: dict[str, Any] = {}
         if mcp_config is not None:
             evaluation_kwargs["mcp_config"] = mcp_config
 
-
+        # Set project name from parameter
+        self.project_name = project_name
 
         # Step 0. Start Opik optimization run
         opik_optimization_run: optimization.Optimization | None = None
@@ -557,14 +550,14 @@ class EvolutionaryOptimizer(BaseOptimizer):
                 objective_name=metric.__name__,
                 metadata={"optimizer": self.__class__.__name__},
             )
-            self.
+            self.current_optimization_id = opik_optimization_run.id
         except Exception as e:
             logger.warning(f"Opik server error: {e}. Continuing without Opik tracking.")
-            self.
+            self.current_optimization_id = None
 
         reporting.display_header(
             algorithm=self.__class__.__name__,
-            optimization_id=self.
+            optimization_id=self.current_optimization_id,
             dataset_id=dataset.id,
             verbose=self.verbose,
         )
@@ -583,7 +576,8 @@ class EvolutionaryOptimizer(BaseOptimizer):
         )
 
         # Step 1. Step variables and define fitness function
-        self.
+        self._reset_counters()  # Reset counters for run
+        trials_used = [0]  # Use list for closure mutability
         self._history: list[OptimizationRound] = []
         self._current_generation = 0
         self._best_fitness_history = []
@@ -595,7 +589,16 @@ class EvolutionaryOptimizer(BaseOptimizer):
 
             def _deap_evaluate_individual_fitness(
                 messages: list[dict[str, str]],
-            ) -> tuple[float,
+            ) -> tuple[float, ...]:
+                # Check if we've hit the limit
+                if trials_used[0] >= max_trials:
+                    logger.debug(
+                        f"Skipping evaluation - max_trials ({max_trials}) reached"
+                    )
+                    return (-float("inf"), float("inf"))  # Worst possible fitness
+
+                trials_used[0] += 1
+
                 primary_fitness_score: float = self._evaluate_prompt(
                     prompt,
                     messages,  # type: ignore
@@ -603,7 +606,7 @@ class EvolutionaryOptimizer(BaseOptimizer):
                     metric=metric,
                     n_samples=n_samples,
                     experiment_config=(experiment_config or {}).copy(),
-                    optimization_id=self.
+                    optimization_id=self.current_optimization_id,
                     verbose=0,
                     **evaluation_kwargs,
                 )
@@ -614,7 +617,16 @@ class EvolutionaryOptimizer(BaseOptimizer):
             # Single-objective
             def _deap_evaluate_individual_fitness(
                 messages: list[dict[str, str]],
-            ) -> tuple[float,
+            ) -> tuple[float, ...]:
+                # Check if we've hit the limit
+                if trials_used[0] >= max_trials:
+                    logger.debug(
+                        f"Skipping evaluation - max_trials ({max_trials}) reached"
+                    )
+                    return (-float("inf"),)  # Worst possible fitness
+
+                trials_used[0] += 1
+
                 fitness_score: float = self._evaluate_prompt(
                     prompt,
                     messages,  # type: ignore
@@ -622,11 +634,11 @@ class EvolutionaryOptimizer(BaseOptimizer):
                     metric=metric,
                     n_samples=n_samples,
                     experiment_config=(experiment_config or {}).copy(),
-                    optimization_id=self.
+                    optimization_id=self.current_optimization_id,
                     verbose=0,
                     **evaluation_kwargs,
                 )
-                return (fitness_score,
+                return (fitness_score,)
 
         self.toolbox.register("evaluate", _deap_evaluate_individual_fitness)
 
```
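Both fitness closures consult a shared `trials_used` cell. A one-element list is used because the nested functions can mutate a list in place but could not rebind a plain integer local without `nonlocal`. A stripped-down model of the budget mechanism, with placeholder scoring:

```python
# Stripped-down sketch of the max_trials budget pattern used above; the
# scoring function is a placeholder, only the cell mechanics mirror the diff.
def make_budgeted_eval(max_trials: int):
    trials_used = [0]  # list acts as a mutable closure cell

    def evaluate(candidate: str) -> tuple[float, ...]:
        if trials_used[0] >= max_trials:
            return (-float("inf"),)  # worst fitness: skip real evaluation
        trials_used[0] += 1
        return (float(len(candidate)),)  # placeholder scoring

    return evaluate, trials_used


evaluate, used = make_budgeted_eval(max_trials=3)
scores = [evaluate(c) for c in ("a", "bb", "ccc", "dddd")]
assert used[0] == 3 and scores[-1] == (-float("inf"),)
```

Returning worst-possible fitness once the budget is spent lets the DEAP selection machinery run unmodified while making over-budget candidates effectively unselectable.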
opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py (continued)

```diff
@@ -644,6 +656,7 @@ class EvolutionaryOptimizer(BaseOptimizer):
             else float(len(json.dumps(prompt.get_messages())))
         )
 
+        trials_used[0] = 0
         best_primary_score_overall = initial_primary_score
         best_prompt_overall = prompt
         report_baseline_performance.set_score(initial_primary_score)
@@ -715,14 +728,22 @@ class EvolutionaryOptimizer(BaseOptimizer):
                 )
                 best_primary_score_overall = current_best_for_primary.fitness.values[0]
                 best_prompt_overall = chat_prompt.ChatPrompt(
-                    messages=current_best_for_primary
+                    messages=current_best_for_primary,
+                    tools=getattr(current_best_for_primary, "tools", prompt.tools),
+                    function_map=getattr(
+                        current_best_for_primary, "function_map", prompt.function_map
+                    ),
                 )
             else:
                 # Single-objective
                 current_best_on_front = hof[0]
                 best_primary_score_overall = current_best_on_front.fitness.values[0]
                 best_prompt_overall = chat_prompt.ChatPrompt(
-                    messages=current_best_on_front
+                    messages=current_best_on_front,
+                    tools=getattr(current_best_on_front, "tools", prompt.tools),
+                    function_map=getattr(
+                        current_best_on_front, "function_map", prompt.function_map
+                    ),
                 )
 
             if self.enable_moo:
@@ -756,6 +777,13 @@ class EvolutionaryOptimizer(BaseOptimizer):
             verbose=self.verbose
         ) as report_evolutionary_algo:
             for generation_idx in range(1, self.num_generations + 1):
+                # Check if we've exhausted our evaluation budget
+                if trials_used[0] >= max_trials:
+                    logger.info(
+                        f"Stopping optimization: max_trials ({max_trials}) reached after {generation_idx - 1} generations"
+                    )
+                    break
+
                 report_evolutionary_algo.start_gen(generation_idx, self.num_generations)
 
                 curr_best_score = self._population_best_score(deap_population)
@@ -854,7 +882,11 @@ class EvolutionaryOptimizer(BaseOptimizer):
                 final_results_log += f"  Solution {i + 1}: Primary Score={sol.fitness.values[0]:.4f}, Length={sol.fitness.values[1]:.0f}, Prompt='{str(sol)[:100]}...'\n"
             best_overall_solution = sorted_hof[0]
             final_best_prompt = chat_prompt.ChatPrompt(
-                messages=best_overall_solution
+                messages=best_overall_solution,
+                tools=getattr(best_overall_solution, "tools", prompt.tools),
+                function_map=getattr(
+                    best_overall_solution, "function_map", prompt.function_map
+                ),
             )
             final_primary_score = best_overall_solution.fitness.values[0]
             final_length = best_overall_solution.fitness.values[1]
@@ -922,11 +954,12 @@ class EvolutionaryOptimizer(BaseOptimizer):
         )
 
         logger.info(f"Total LLM calls during optimization: {self.llm_call_counter}")
+        logger.info(f"Total prompt evaluations: {trials_used[0]}")
         if opik_optimization_run:
             try:
                 opik_optimization_run.update(status="completed")
                 logger.info(
-                    f"Opik Optimization run {self.
+                    f"Opik Optimization run {self.current_optimization_id} status updated to completed."
                 )
             except Exception as e:
                 logger.warning(f"Failed to update Opik Optimization run status: {e}")
@@ -952,13 +985,14 @@ class EvolutionaryOptimizer(BaseOptimizer):
                 "seed": self.seed,
                 "prompt_type": "single_string_ga",
                 "initial_score_for_display": initial_score_for_display,
-                "temperature": self.
+                "temperature": self.model_parameters.get("temperature"),
                 "stopped_early": stopped_early_flag,
                 "rounds": self.get_history(),
                 "user_output_style_guidance": self.output_style_guidance,
                 "infer_output_style_requested": self.infer_output_style,
                 "final_effective_output_style_guidance": effective_output_style_guidance,
                 "infer_output_style": self.infer_output_style,
+                "trials_used": trials_used[0],
             }
         )
 
@@ -974,14 +1008,7 @@ class EvolutionaryOptimizer(BaseOptimizer):
         final_tools = getattr(final_best_prompt, "tools", None)
         if final_tools:
             final_details["final_tools"] = final_tools
-
-                (tool.get("function", {}).get("name") or f"tool_{idx}"): tool.get(
-                    "function", {}
-                ).get("description")
-                for idx, tool in enumerate(final_tools)
-            }
-        else:
-            tool_prompts = None
+        tool_prompts = self._extract_tool_prompts(final_tools)
 
         return OptimizationResult(
             optimizer=self.__class__.__name__,
@@ -995,7 +1022,7 @@ class EvolutionaryOptimizer(BaseOptimizer):
             llm_calls=self.llm_call_counter,
             tool_calls=self.tool_call_counter,
             dataset_id=dataset.id,
-            optimization_id=self.
+            optimization_id=self.current_optimization_id,
             tool_prompts=tool_prompts,
         )
 
```
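Each rebuild site above follows one pattern: the DEAP individual is a list of messages that may carry `tools` and `function_map` as attributes (set in `_create_individual_from_prompt`), and `getattr` falls back to the original prompt when an attribute is absent. An illustrative model with stand-in classes, not the package's own types:

```python
# Illustrative model of the getattr-fallback rebuild; SimplePrompt and
# Individual are stand-ins for chat_prompt.ChatPrompt and creator.Individual.
from typing import Any


class SimplePrompt:
    def __init__(self, messages: list, tools: Any = None, function_map: Any = None):
        self.messages = messages
        self.tools = tools
        self.function_map = function_map


class Individual(list):
    """DEAP-style individual: a message list that may carry extra attributes."""


def rebuild(best: Individual, original: SimplePrompt) -> SimplePrompt:
    return SimplePrompt(
        messages=list(best),
        tools=getattr(best, "tools", original.tools),
        function_map=getattr(best, "function_map", original.function_map),
    )


original = SimplePrompt([{"role": "system", "content": "be terse"}], tools=[{"name": "t"}])
bare = Individual(original.messages)  # attributes never set on this individual
assert rebuild(bare, original).tools == original.tools  # falls back cleanly
```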
opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py (continued)

```diff
@@ -1090,7 +1117,7 @@ class EvolutionaryOptimizer(BaseOptimizer):
             self._mcp_context = previous_context
             self.enable_llm_crossover = previous_crossover
 
-        finalize_mcp_result(result, context, panel_style)
+        finalize_mcp_result(result, context, panel_style, optimizer=self)
         return result
 
     # Evaluation is provided by EvaluationOps
@@ -1102,7 +1129,7 @@ class EvolutionaryOptimizer(BaseOptimizer):
     def _get_reasoning_system_prompt_for_variation(self) -> str:
         return evo_prompts.variation_system_prompt(self.output_style_guidance)
 
-    def
+    def _get_llm_crossover_system_prompt(self) -> str:
         return evo_prompts.llm_crossover_system_prompt(self.output_style_guidance)
 
     def _get_radical_innovation_system_prompt(self) -> str:
```
opik_optimizer/evolutionary_optimizer/mcp.py

```diff
@@ -168,16 +168,21 @@ def finalize_mcp_result(
     result: Any,
     context: EvolutionaryMCPContext,
     panel_style: str,
+    optimizer: Any = None,
 ) -> None:
     final_tools = (
         result.details.get("final_tools") if isinstance(result.details, dict) else None
     )
-
-
-
-
-
-
+    # Use optimizer's centralized method if available, otherwise inline
+    if optimizer and hasattr(optimizer, "_extract_tool_prompts"):
+        tool_prompts = optimizer._extract_tool_prompts(final_tools) or {}
+    else:
+        tool_prompts = {
+            (tool.get("function", {}).get("name") or tool.get("name")): tool.get(
+                "function", {}
+            ).get("description")
+            for tool in (final_tools or [])
+        }
     if tool_prompts.get(context.tool_name):
         reporting.display_tool_description(
             tool_prompts[context.tool_name],
```
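The inline fallback in this hunk is a dict comprehension over OpenAI-style tool schemas, mapping each tool name to its description; falling back to a top-level `"name"` key replaces the `f"tool_{idx}"` placeholder seen in the code removed from evolutionary_optimizer.py. The same logic, pulled out as a standalone function for clarity:

```python
# Standalone restatement of the fallback extraction above; the function
# name is illustrative, the comprehension body comes from the diff.
def extract_tool_prompts(final_tools):
    return {
        (tool.get("function", {}).get("name") or tool.get("name")): tool.get(
            "function", {}
        ).get("description")
        for tool in (final_tools or [])
    }


tools = [
    {"function": {"name": "search", "description": "Search the index."}},
    {"name": "noop"},  # flat entry: name kept, description is None
]
assert extract_tool_prompts(tools) == {"search": "Search the index.", "noop": None}
```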
opik_optimizer/evolutionary_optimizer/mutation_ops.py

```diff
@@ -30,7 +30,11 @@ class MutationOps:
         self, individual: Any, initial_prompt: chat_prompt.ChatPrompt
     ) -> Any:
         """Enhanced mutation operation with multiple strategies."""
-        prompt = chat_prompt.ChatPrompt(
+        prompt = chat_prompt.ChatPrompt(
+            messages=individual,
+            tools=initial_prompt.tools,
+            function_map=initial_prompt.function_map,
+        )
 
         mcp_context = getattr(self, "_mcp_context", None)
         if mcp_context is not None:
@@ -130,7 +134,11 @@ class MutationOps:
                     f"Error parsing semantic mutation response as JSON. "
                     f"Response: {response!r}\nOriginal error: {parse_exc}"
                 ) from parse_exc
-            return chat_prompt.ChatPrompt(
+            return chat_prompt.ChatPrompt(
+                messages=messages,
+                tools=prompt.tools,
+                function_map=prompt.function_map,
+            )
         except Exception as e:
             reporting.display_error(
                 f"  Error in semantic mutation, this is usually a parsing error: {e}",
@@ -187,7 +195,11 @@ class MutationOps:
             else:
                 mutated_messages.append({"role": role, "content": content})
 
-        return chat_prompt.ChatPrompt(
+        return chat_prompt.ChatPrompt(
+            messages=mutated_messages,
+            tools=prompt.tools,
+            function_map=prompt.function_map,
+        )
 
     def _word_level_mutation_prompt(
         self, prompt: chat_prompt.ChatPrompt
@@ -200,7 +212,11 @@ class MutationOps:
                     "content": self._word_level_mutation(message["content"]),
                 }
             )
-        return chat_prompt.ChatPrompt(
+        return chat_prompt.ChatPrompt(
+            messages=mutated_messages,
+            tools=prompt.tools,
+            function_map=prompt.function_map,
+        )
 
     def _word_level_mutation(self, msg_content: str) -> str:
         """Perform word-level mutation."""
@@ -298,7 +314,11 @@ class MutationOps:
                     f"Failed to parse LLM output in radical innovation mutation for prompt '{json.dumps(prompt.get_messages())[:50]}...'. Output: {new_prompt_str[:200]}. Error: {parse_exc}. Returning original."
                 )
                 return prompt
-            return chat_prompt.ChatPrompt(
+            return chat_prompt.ChatPrompt(
+                messages=new_messages,
+                tools=prompt.tools,
+                function_map=prompt.function_map,
+            )
         except Exception as e:
             logger.warning(
                 f"Radical innovation mutation failed for prompt '{json.dumps(prompt.get_messages())[:50]}...': {e}. Returning original."
```
opik_optimizer/evolutionary_optimizer/population_ops.py

```diff
@@ -88,13 +88,21 @@ class PopulationOps:
                 p.get("role") is not None for p in fresh_prompts
             ):
                 population.append(
-                    chat_prompt.ChatPrompt(
+                    chat_prompt.ChatPrompt(
+                        messages=fresh_prompts,
+                        tools=prompt.tools,
+                        function_map=prompt.function_map,
+                    )
                 )
                 init_pop_report.success_fresh_prompts(1)
             elif all(isinstance(p, list) for p in fresh_prompts):
                 population.extend(
                     [
-                        chat_prompt.ChatPrompt(
+                        chat_prompt.ChatPrompt(
+                            messages=p,
+                            tools=prompt.tools,
+                            function_map=prompt.function_map,
+                        )
                         for p in fresh_prompts[:num_fresh_starts]
                     ]
                 )
@@ -157,7 +165,11 @@ class PopulationOps:
             )
             population.extend(
                 [
-                    chat_prompt.ChatPrompt(
+                    chat_prompt.ChatPrompt(
+                        messages=p,
+                        tools=prompt.tools,
+                        function_map=prompt.function_map,
+                    )
                     for p in generated_prompts_variations[
                         :num_variations_on_initial
                     ]
@@ -210,16 +222,20 @@ class PopulationOps:
         else:
             elites = tools.selBest(population, self.elitism_size)
 
-
-
-
+        if elites:
+            best_elite = max(elites, key=lambda x: x.fitness.values[0])
+            seed_prompt = chat_prompt.ChatPrompt(
+                messages=best_elite,
+                tools=getattr(best_elite, "tools", best_prompt_so_far.tools),
+                function_map=getattr(
+                    best_elite, "function_map", best_prompt_so_far.function_map
+                ),
             )
-
-
-)
+        else:
+            seed_prompt = best_prompt_so_far
 
         prompt_variants = self._initialize_population(seed_prompt)
-        new_pop = [
+        new_pop = [self._create_individual_from_prompt(p) for p in prompt_variants]  # type: ignore[attr-defined]
 
         for ind, fit in zip(new_pop, map(self.toolbox.evaluate, new_pop)):
             ind.fitness.values = fit
```