opik-optimizer 1.0.6__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opik_optimizer/__init__.py +4 -0
- opik_optimizer/_throttle.py +2 -1
- opik_optimizer/base_optimizer.py +402 -28
- opik_optimizer/data/context7_eval.jsonl +3 -0
- opik_optimizer/datasets/context7_eval.py +90 -0
- opik_optimizer/datasets/tiny_test.py +33 -34
- opik_optimizer/datasets/truthful_qa.py +2 -2
- opik_optimizer/evolutionary_optimizer/crossover_ops.py +194 -0
- opik_optimizer/evolutionary_optimizer/evaluation_ops.py +136 -0
- opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +289 -966
- opik_optimizer/evolutionary_optimizer/helpers.py +10 -0
- opik_optimizer/evolutionary_optimizer/llm_support.py +136 -0
- opik_optimizer/evolutionary_optimizer/mcp.py +249 -0
- opik_optimizer/evolutionary_optimizer/mutation_ops.py +306 -0
- opik_optimizer/evolutionary_optimizer/population_ops.py +228 -0
- opik_optimizer/evolutionary_optimizer/prompts.py +352 -0
- opik_optimizer/evolutionary_optimizer/reporting.py +28 -4
- opik_optimizer/evolutionary_optimizer/style_ops.py +86 -0
- opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +90 -81
- opik_optimizer/few_shot_bayesian_optimizer/reporting.py +12 -5
- opik_optimizer/gepa_optimizer/__init__.py +3 -0
- opik_optimizer/gepa_optimizer/adapter.py +154 -0
- opik_optimizer/gepa_optimizer/gepa_optimizer.py +653 -0
- opik_optimizer/gepa_optimizer/reporting.py +181 -0
- opik_optimizer/logging_config.py +42 -7
- opik_optimizer/mcp_utils/__init__.py +22 -0
- opik_optimizer/mcp_utils/mcp.py +541 -0
- opik_optimizer/mcp_utils/mcp_second_pass.py +152 -0
- opik_optimizer/mcp_utils/mcp_simulator.py +116 -0
- opik_optimizer/mcp_utils/mcp_workflow.py +547 -0
- opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +470 -134
- opik_optimizer/meta_prompt_optimizer/reporting.py +16 -2
- opik_optimizer/mipro_optimizer/_lm.py +30 -23
- opik_optimizer/mipro_optimizer/_mipro_optimizer_v2.py +52 -51
- opik_optimizer/mipro_optimizer/mipro_optimizer.py +126 -46
- opik_optimizer/mipro_optimizer/utils.py +2 -4
- opik_optimizer/optimizable_agent.py +21 -16
- opik_optimizer/optimization_config/chat_prompt.py +44 -23
- opik_optimizer/optimization_config/configs.py +3 -3
- opik_optimizer/optimization_config/mappers.py +9 -8
- opik_optimizer/optimization_result.py +22 -14
- opik_optimizer/reporting_utils.py +61 -10
- opik_optimizer/task_evaluator.py +9 -8
- opik_optimizer/utils/__init__.py +15 -0
- opik_optimizer/utils/colbert.py +236 -0
- opik_optimizer/{utils.py → utils/core.py} +160 -33
- opik_optimizer/utils/dataset_utils.py +49 -0
- opik_optimizer/utils/prompt_segments.py +186 -0
- opik_optimizer-2.0.0.dist-info/METADATA +345 -0
- opik_optimizer-2.0.0.dist-info/RECORD +74 -0
- opik_optimizer-2.0.0.dist-info/licenses/LICENSE +203 -0
- opik_optimizer-1.0.6.dist-info/METADATA +0 -181
- opik_optimizer-1.0.6.dist-info/RECORD +0 -50
- opik_optimizer-1.0.6.dist-info/licenses/LICENSE +0 -21
- {opik_optimizer-1.0.6.dist-info → opik_optimizer-2.0.0.dist-info}/WHEEL +0 -0
- {opik_optimizer-1.0.6.dist-info → opik_optimizer-2.0.0.dist-info}/top_level.txt +0 -0
@@ -1,40 +1,47 @@
|
|
1
|
+
import copy
|
1
2
|
import json
|
2
3
|
import logging
|
3
|
-
import os
|
4
4
|
import random
|
5
|
-
from typing import Any,
|
5
|
+
from typing import Any, cast, TYPE_CHECKING
|
6
|
+
from collections.abc import Callable
|
7
|
+
import sys
|
8
|
+
import warnings
|
6
9
|
|
7
10
|
import rapidfuzz.distance.Indel
|
8
|
-
import litellm
|
9
11
|
import numpy as np
|
10
12
|
import opik
|
11
13
|
|
12
14
|
# DEAP imports
|
13
15
|
from deap import base, tools
|
14
16
|
from deap import creator as _creator
|
15
|
-
from
|
16
|
-
from litellm.caching import Cache
|
17
|
-
from litellm.types.caching import LiteLLMCacheType
|
18
|
-
from opik.api_objects import opik_client, optimization
|
17
|
+
from opik.api_objects import optimization
|
19
18
|
from opik.environment import get_tqdm_for_current_environment
|
20
|
-
from opik.evaluation.models.litellm import opik_monitor as opik_litellm_monitor
|
21
19
|
|
22
|
-
from opik_optimizer import _throttle, task_evaluator
|
23
20
|
from opik_optimizer.base_optimizer import BaseOptimizer, OptimizationRound
|
24
|
-
from opik_optimizer.optimization_config import chat_prompt
|
21
|
+
from opik_optimizer.optimization_config import chat_prompt
|
25
22
|
from opik_optimizer.optimization_result import OptimizationResult
|
26
23
|
from opik_optimizer.optimizable_agent import OptimizableAgent
|
24
|
+
from opik_optimizer.mcp_utils.mcp_second_pass import MCPSecondPassCoordinator
|
25
|
+
from opik_optimizer.mcp_utils.mcp_workflow import (
|
26
|
+
MCPExecutionConfig,
|
27
|
+
extract_tool_arguments,
|
28
|
+
)
|
29
|
+
from opik_optimizer.utils.prompt_segments import extract_prompt_segments
|
30
|
+
|
31
|
+
from .mcp import EvolutionaryMCPContext, finalize_mcp_result
|
27
32
|
|
28
|
-
from .. import utils
|
29
33
|
from . import reporting
|
34
|
+
from .llm_support import LlmSupport
|
35
|
+
from .mutation_ops import MutationOps
|
36
|
+
from .crossover_ops import CrossoverOps
|
37
|
+
from .population_ops import PopulationOps
|
38
|
+
from .evaluation_ops import EvaluationOps
|
39
|
+
from .helpers import Helpers
|
40
|
+
from .style_ops import StyleOps
|
41
|
+
from . import prompts as evo_prompts
|
30
42
|
|
31
43
|
logger = logging.getLogger(__name__)
|
32
44
|
tqdm = get_tqdm_for_current_environment()
|
33
|
-
_rate_limiter = _throttle.get_rate_limiter_for_current_opik_installation()
|
34
|
-
|
35
|
-
# Using disk cache for LLM calls
|
36
|
-
disk_cache_dir = os.path.expanduser("~/.litellm_cache")
|
37
|
-
litellm.cache = Cache(type=LiteLLMCacheType.DISK, disk_cache_dir=disk_cache_dir)
|
38
45
|
|
39
46
|
creator = cast(Any, _creator) # type: ignore[assignment]
|
40
47
|
|
@@ -80,19 +87,14 @@ class EvolutionaryOptimizer(BaseOptimizer):
|
|
80
87
|
)
|
81
88
|
DEFAULT_MOO_WEIGHTS = (1.0, -1.0) # (Maximize Score, Minimize Length)
|
82
89
|
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
Provide a single string that summarizes this style. This summary should be directly usable as an instruction for another LLM.
|
93
|
-
For example: 'Outputs should be a single, concise proper noun.' OR 'Outputs should be a short paragraph explaining the reasoning, followed by a direct answer, avoiding conversational pleasantries.' OR 'Outputs are typically 1-2 sentences, providing a direct factual answer.'
|
94
|
-
Return ONLY this descriptive string, with no preamble or extra formatting.
|
95
|
-
"""
|
90
|
+
# Prompt constants moved into prompts.py
|
91
|
+
if TYPE_CHECKING:
|
92
|
+
_llm_deap_crossover: Any
|
93
|
+
_deap_crossover: Any
|
94
|
+
_deap_mutation: Any
|
95
|
+
_initialize_population: Any
|
96
|
+
_evaluate_prompt: Any
|
97
|
+
_infer_output_style_from_dataset: Any
|
96
98
|
|
97
99
|
def __init__(
|
98
100
|
self,
|
@@ -102,13 +104,13 @@ Return ONLY this descriptive string, with no preamble or extra formatting.
|
|
102
104
|
mutation_rate: float = DEFAULT_MUTATION_RATE,
|
103
105
|
crossover_rate: float = DEFAULT_CROSSOVER_RATE,
|
104
106
|
tournament_size: int = DEFAULT_TOURNAMENT_SIZE,
|
105
|
-
num_threads:
|
107
|
+
num_threads: int | None = None,
|
106
108
|
elitism_size: int = DEFAULT_ELITISM_SIZE,
|
107
109
|
adaptive_mutation: bool = DEFAULT_ADAPTIVE_MUTATION,
|
108
110
|
enable_moo: bool = DEFAULT_ENABLE_MOO,
|
109
111
|
enable_llm_crossover: bool = DEFAULT_ENABLE_LLM_CROSSOVER,
|
110
|
-
seed:
|
111
|
-
output_style_guidance:
|
112
|
+
seed: int | None = DEFAULT_SEED,
|
113
|
+
output_style_guidance: str | None = None,
|
112
114
|
infer_output_style: bool = False,
|
113
115
|
verbose: int = 1,
|
114
116
|
n_threads: int = DEFAULT_NUM_THREADS,
|
@@ -134,9 +136,18 @@ Return ONLY this descriptive string, with no preamble or extra formatting.
|
|
134
136
|
**model_kwargs: Additional model parameters
|
135
137
|
"""
|
136
138
|
# Initialize base class first
|
139
|
+
if sys.version_info >= (3, 13):
|
140
|
+
warnings.warn(
|
141
|
+
"Python 3.13 is not officially supported (python_requires <3.13). "
|
142
|
+
"You may see asyncio teardown warnings. Prefer Python 3.12.",
|
143
|
+
RuntimeWarning,
|
144
|
+
)
|
137
145
|
if "project_name" in model_kwargs:
|
138
|
-
|
139
|
-
"
|
146
|
+
warnings.warn(
|
147
|
+
"The 'project_name' parameter in optimizer constructor is deprecated. "
|
148
|
+
"Set project_name in the ChatPrompt instead.",
|
149
|
+
DeprecationWarning,
|
150
|
+
stacklevel=2,
|
140
151
|
)
|
141
152
|
del model_kwargs["project_name"]
|
142
153
|
|
@@ -147,29 +158,32 @@ Return ONLY this descriptive string, with no preamble or extra formatting.
|
|
147
158
|
self.crossover_rate = crossover_rate
|
148
159
|
self.tournament_size = tournament_size
|
149
160
|
if num_threads is not None:
|
150
|
-
|
161
|
+
warnings.warn(
|
162
|
+
"The 'num_threads' parameter is deprecated and will be removed in a future version. "
|
163
|
+
"Use 'n_threads' instead.",
|
164
|
+
DeprecationWarning,
|
165
|
+
stacklevel=2,
|
166
|
+
)
|
151
167
|
n_threads = num_threads
|
152
168
|
self.num_threads = n_threads
|
153
169
|
self.elitism_size = elitism_size
|
154
170
|
self.adaptive_mutation = adaptive_mutation
|
155
171
|
self.enable_moo = enable_moo
|
156
172
|
self.enable_llm_crossover = enable_llm_crossover
|
157
|
-
self.seed = seed
|
173
|
+
self.seed = seed if seed is not None else self.DEFAULT_SEED
|
158
174
|
self.output_style_guidance = (
|
159
175
|
output_style_guidance
|
160
176
|
if output_style_guidance is not None
|
161
177
|
else self.DEFAULT_OUTPUT_STYLE_GUIDANCE
|
162
178
|
)
|
163
179
|
self.infer_output_style = infer_output_style
|
164
|
-
self.
|
165
|
-
self._opik_client = opik_client.get_client_cached()
|
166
|
-
self._current_optimization_id: Optional[str] = None
|
180
|
+
self._current_optimization_id: str | None = None
|
167
181
|
self._current_generation = 0
|
168
|
-
self._best_fitness_history:
|
182
|
+
self._best_fitness_history: list[float] = []
|
169
183
|
self._generations_without_improvement = 0
|
170
|
-
self._current_population:
|
184
|
+
self._current_population: list[Any] = []
|
171
185
|
self._generations_without_overall_improvement = 0
|
172
|
-
self._best_primary_score_history:
|
186
|
+
self._best_primary_score_history: list[float] = []
|
173
187
|
self._gens_since_pop_improvement: int = 0
|
174
188
|
|
175
189
|
if self.seed is not None:
|
@@ -199,13 +213,9 @@ Return ONLY this descriptive string, with no preamble or extra formatting.
|
|
199
213
|
creator.create("Individual", list, fitness=fitness_attr)
|
200
214
|
|
201
215
|
self.toolbox = base.Toolbox()
|
202
|
-
|
203
|
-
|
204
|
-
)
|
205
|
-
self.toolbox.register(
|
206
|
-
"population", tools.initRepeat, list, self.toolbox.default_individual
|
207
|
-
)
|
208
|
-
|
216
|
+
# Attach methods from helper mixin modules to this instance to avoid
|
217
|
+
# multiple inheritance while preserving behavior.
|
218
|
+
self._attach_helper_methods()
|
209
219
|
if self.enable_llm_crossover:
|
210
220
|
self.toolbox.register("mate", self._llm_deap_crossover)
|
211
221
|
else:
|
@@ -228,6 +238,95 @@ Return ONLY this descriptive string, with no preamble or extra formatting.
|
|
228
238
|
f"mutation_rate: {self.mutation_rate}, crossover_rate: {self.crossover_rate}"
|
229
239
|
)
|
230
240
|
|
241
|
+
# (methods already attached above)
|
242
|
+
self._mcp_context: EvolutionaryMCPContext | None = None
|
243
|
+
|
244
|
+
def _attach_helper_methods(self) -> None:
|
245
|
+
"""Bind selected methods from mixin modules onto this instance."""
|
246
|
+
|
247
|
+
def bind(cls: Any, names: list[str]) -> None:
|
248
|
+
for name in names:
|
249
|
+
func = getattr(cls, name)
|
250
|
+
setattr(self, name, func.__get__(self, self.__class__))
|
251
|
+
|
252
|
+
# LLM calls
|
253
|
+
bind(LlmSupport, ["_call_model"])
|
254
|
+
|
255
|
+
# Mutations
|
256
|
+
bind(
|
257
|
+
MutationOps,
|
258
|
+
[
|
259
|
+
"_deap_mutation",
|
260
|
+
"_semantic_mutation",
|
261
|
+
"_structural_mutation",
|
262
|
+
"_word_level_mutation_prompt",
|
263
|
+
"_word_level_mutation",
|
264
|
+
"_get_synonym",
|
265
|
+
"_modify_phrase",
|
266
|
+
"_radical_innovation_mutation",
|
267
|
+
],
|
268
|
+
)
|
269
|
+
|
270
|
+
# Crossover
|
271
|
+
bind(
|
272
|
+
CrossoverOps,
|
273
|
+
[
|
274
|
+
"_deap_crossover_chunking_strategy",
|
275
|
+
"_deap_crossover_word_level",
|
276
|
+
"_deap_crossover",
|
277
|
+
"_llm_deap_crossover",
|
278
|
+
"_extract_json_arrays",
|
279
|
+
],
|
280
|
+
)
|
281
|
+
|
282
|
+
# Population management
|
283
|
+
bind(
|
284
|
+
PopulationOps,
|
285
|
+
[
|
286
|
+
"_initialize_population",
|
287
|
+
"_should_restart_population",
|
288
|
+
"_restart_population",
|
289
|
+
],
|
290
|
+
)
|
291
|
+
|
292
|
+
# Evaluation
|
293
|
+
bind(EvaluationOps, ["_evaluate_prompt"])
|
294
|
+
|
295
|
+
# Helpers
|
296
|
+
bind(Helpers, ["_get_task_description_for_llm"])
|
297
|
+
|
298
|
+
# Style inference
|
299
|
+
bind(StyleOps, ["_infer_output_style_from_dataset"])
|
300
|
+
|
301
|
+
def get_optimizer_metadata(self) -> dict[str, Any]:
|
302
|
+
return {
|
303
|
+
"population_size": self.population_size,
|
304
|
+
"num_generations": self.num_generations,
|
305
|
+
"mutation_rate": self.mutation_rate,
|
306
|
+
"crossover_rate": self.crossover_rate,
|
307
|
+
"tournament_size": self.tournament_size,
|
308
|
+
"elitism_size": self.elitism_size,
|
309
|
+
"adaptive_mutation": self.adaptive_mutation,
|
310
|
+
"enable_moo": self.enable_moo,
|
311
|
+
"enable_llm_crossover": self.enable_llm_crossover,
|
312
|
+
"infer_output_style": self.infer_output_style,
|
313
|
+
"output_style_guidance": self.output_style_guidance,
|
314
|
+
}
|
315
|
+
|
316
|
+
def _create_individual_from_prompt(
|
317
|
+
self, prompt_candidate: chat_prompt.ChatPrompt
|
318
|
+
) -> Any:
|
319
|
+
individual = creator.Individual(prompt_candidate.get_messages())
|
320
|
+
setattr(individual, "tools", copy.deepcopy(prompt_candidate.tools))
|
321
|
+
return individual
|
322
|
+
|
323
|
+
def _update_individual_with_prompt(
|
324
|
+
self, individual: Any, prompt_candidate: chat_prompt.ChatPrompt
|
325
|
+
) -> Any:
|
326
|
+
individual[:] = prompt_candidate.get_messages()
|
327
|
+
setattr(individual, "tools", copy.deepcopy(prompt_candidate.tools))
|
328
|
+
return individual
|
329
|
+
|
231
330
|
def _get_adaptive_mutation_rate(self) -> float:
|
232
331
|
"""Calculate adaptive mutation rate based on population diversity and progress."""
|
233
332
|
if not self.adaptive_mutation or len(self._best_fitness_history) < 2:
|
@@ -286,552 +385,7 @@ Return ONLY this descriptive string, with no preamble or extra formatting.
|
|
286
385
|
|
287
386
|
return total_distance / count if count > 0 else 0.0
|
288
387
|
|
289
|
-
|
290
|
-
self, messages_1_str: str, messages_2_str: str
|
291
|
-
) -> Tuple[str, str]:
|
292
|
-
chunks1 = [
|
293
|
-
chunk.strip() for chunk in messages_1_str.split(".") if chunk.strip()
|
294
|
-
]
|
295
|
-
chunks2 = [
|
296
|
-
chunk.strip() for chunk in messages_2_str.split(".") if chunk.strip()
|
297
|
-
]
|
298
|
-
|
299
|
-
# Try chunk-level crossover if both parents have at least 2 chunks
|
300
|
-
if len(chunks1) >= 2 and len(chunks2) >= 2:
|
301
|
-
min_num_chunks = min(len(chunks1), len(chunks2))
|
302
|
-
# Crossover point is between 1 and min_num_chunks - 1
|
303
|
-
# This requires min_num_chunks >= 2, which is already checked.
|
304
|
-
point = random.randint(1, min_num_chunks - 1)
|
305
|
-
|
306
|
-
child1_chunks = chunks1[:point] + chunks2[point:]
|
307
|
-
child2_chunks = chunks2[:point] + chunks1[point:]
|
308
|
-
|
309
|
-
child1_str = ". ".join(child1_chunks) + ("." if child1_chunks else "")
|
310
|
-
child2_str = ". ".join(child2_chunks) + ("." if child2_chunks else "")
|
311
|
-
|
312
|
-
return child1_str, child2_str
|
313
|
-
else:
|
314
|
-
raise ValueError(
|
315
|
-
"Not enough chunks in either prompt for chunk-level crossover"
|
316
|
-
)
|
317
|
-
|
318
|
-
def _deap_crossover_word_level(
|
319
|
-
self, messages_1_str: str, messages_2_str: str
|
320
|
-
) -> Tuple[str, str]:
|
321
|
-
words1 = messages_1_str.split()
|
322
|
-
words2 = messages_2_str.split()
|
323
|
-
|
324
|
-
# If either prompt is empty (no words), return parents
|
325
|
-
if not words1 or not words2:
|
326
|
-
return messages_1_str, messages_2_str
|
327
|
-
|
328
|
-
min_word_len = min(len(words1), len(words2))
|
329
|
-
# Need at least 2 words in the shorter prompt for a valid crossover point
|
330
|
-
if min_word_len < 2:
|
331
|
-
return messages_1_str, messages_2_str
|
332
|
-
|
333
|
-
# Crossover point for words: 1 to min_word_len - 1
|
334
|
-
point = random.randint(1, min_word_len - 1)
|
335
|
-
child1_words = words1[:point] + words2[point:]
|
336
|
-
child2_words = words2[:point] + words1[point:]
|
337
|
-
|
338
|
-
return " ".join(child1_words), " ".join(child2_words)
|
339
|
-
|
340
|
-
def _deap_crossover(self, ind1: Any, ind2: Any) -> Tuple[Any, Any]:
|
341
|
-
"""Enhanced crossover operation that preserves semantic meaning.
|
342
|
-
Attempts chunk-level crossover first, then falls back to word-level.
|
343
|
-
"""
|
344
|
-
reporting.display_message(
|
345
|
-
" Recombining prompts by mixing and matching words and sentences.",
|
346
|
-
verbose=self.verbose,
|
347
|
-
)
|
348
|
-
messages_1_orig: List[Dict[str, str]] = ind1
|
349
|
-
messages_2_orig: List[Dict[str, str]] = ind2
|
350
|
-
|
351
|
-
for i, message_1 in enumerate(messages_1_orig):
|
352
|
-
role: str = message_1["role"]
|
353
|
-
message_1_str: str = message_1["content"]
|
354
|
-
|
355
|
-
# We check that the second message has enough AI messages and the correct role
|
356
|
-
if (len(messages_2_orig) >= i + 1) and (messages_2_orig[i]["role"] == role):
|
357
|
-
message_2 = messages_2_orig[i]
|
358
|
-
message_2_str: str = message_2["content"]
|
359
|
-
|
360
|
-
try:
|
361
|
-
child1_str, child2_str = self._deap_crossover_chunking_strategy(
|
362
|
-
message_1_str, message_2_str
|
363
|
-
)
|
364
|
-
except ValueError:
|
365
|
-
child1_str, child2_str = self._deap_crossover_word_level(
|
366
|
-
message_1_str, message_2_str
|
367
|
-
)
|
368
|
-
|
369
|
-
# Update the message content
|
370
|
-
messages_1_orig[i]["content"] = child1_str
|
371
|
-
messages_2_orig[i]["content"] = child2_str
|
372
|
-
else:
|
373
|
-
# We don't perform any crossover if there are not enough messages or the roles
|
374
|
-
# don't match
|
375
|
-
pass
|
376
|
-
|
377
|
-
return creator.Individual(messages_1_orig), creator.Individual(messages_2_orig)
|
378
|
-
|
379
|
-
def _deap_mutation(
|
380
|
-
self, individual: Any, initial_prompt: chat_prompt.ChatPrompt
|
381
|
-
) -> Any:
|
382
|
-
"""Enhanced mutation operation with multiple strategies."""
|
383
|
-
prompt = chat_prompt.ChatPrompt(messages=individual)
|
384
|
-
|
385
|
-
# Choose mutation strategy based on current diversity
|
386
|
-
diversity = self._calculate_population_diversity()
|
387
|
-
|
388
|
-
# Determine thresholds based on diversity
|
389
|
-
if diversity < self.DEFAULT_DIVERSITY_THRESHOLD:
|
390
|
-
# Low diversity - use more aggressive mutations (higher chance for semantic)
|
391
|
-
semantic_threshold = 0.5
|
392
|
-
structural_threshold = 0.8 # semantic_threshold + 0.3
|
393
|
-
else:
|
394
|
-
# Good diversity - use more conservative mutations (higher chance for word_level)
|
395
|
-
semantic_threshold = 0.4
|
396
|
-
structural_threshold = 0.7 # semantic_threshold + 0.3
|
397
|
-
|
398
|
-
mutation_choice = random.random()
|
399
|
-
|
400
|
-
if mutation_choice > structural_threshold:
|
401
|
-
# This corresponds to the original 'else' (word_level_mutation)
|
402
|
-
mutated_prompt = self._word_level_mutation_prompt(prompt)
|
403
|
-
reporting.display_success(
|
404
|
-
" Mutation successful, prompt has been edited by randomizing words (word-level mutation).",
|
405
|
-
verbose=self.verbose,
|
406
|
-
)
|
407
|
-
return creator.Individual(mutated_prompt.get_messages())
|
408
|
-
elif mutation_choice > semantic_threshold:
|
409
|
-
# This corresponds to the original 'elif' (structural_mutation)
|
410
|
-
mutated_prompt = self._structural_mutation(prompt)
|
411
|
-
reporting.display_success(
|
412
|
-
" Mutation successful, prompt has been edited by reordering, combining, or splitting sentences (structural mutation).",
|
413
|
-
verbose=self.verbose,
|
414
|
-
)
|
415
|
-
return creator.Individual(mutated_prompt.get_messages())
|
416
|
-
else:
|
417
|
-
# This corresponds to the original 'if' (semantic_mutation)
|
418
|
-
mutated_prompt = self._semantic_mutation(prompt, initial_prompt)
|
419
|
-
reporting.display_success(
|
420
|
-
" Mutation successful, prompt has been edited using an LLM (semantic mutation).",
|
421
|
-
verbose=self.verbose,
|
422
|
-
)
|
423
|
-
return creator.Individual(mutated_prompt.get_messages())
|
424
|
-
|
425
|
-
def _semantic_mutation(
|
426
|
-
self, prompt: chat_prompt.ChatPrompt, initial_prompt: chat_prompt.ChatPrompt
|
427
|
-
) -> chat_prompt.ChatPrompt:
|
428
|
-
"""Enhanced semantic mutation with multiple strategies."""
|
429
|
-
current_output_style_guidance = self.output_style_guidance
|
430
|
-
if random.random() < 0.1:
|
431
|
-
return self._radical_innovation_mutation(prompt, initial_prompt)
|
432
|
-
|
433
|
-
try:
|
434
|
-
strategy = random.choice(
|
435
|
-
[
|
436
|
-
"rephrase",
|
437
|
-
"simplify",
|
438
|
-
"elaborate",
|
439
|
-
"restructure",
|
440
|
-
"focus",
|
441
|
-
"increase_complexity_and_detail",
|
442
|
-
]
|
443
|
-
)
|
444
|
-
|
445
|
-
strategy_prompts = {
|
446
|
-
"rephrase": f"Create a different way to express the same instruction, possibly with a different length or structure, ensuring it still aims for an answer from the target LLM in the style of: '{current_output_style_guidance}'.",
|
447
|
-
"simplify": f"Simplify the instruction while maintaining its core meaning, potentially making it more concise, to elicit an answer in the style of: '{current_output_style_guidance}'.",
|
448
|
-
"elaborate": f"Add more relevant detail and specificity to the instruction, potentially increasing its length, but only if it helps achieve a more accurate answer from the target LLM in the style of: '{current_output_style_guidance}'.",
|
449
|
-
"restructure": f"Change the structure of the instruction (e.g., reorder sentences, combine/split ideas) while keeping its intent, ensuring the new structure strongly guides towards an output in the style of: '{current_output_style_guidance}'.",
|
450
|
-
"focus": f"Emphasize the key aspects of the instruction, perhaps by rephrasing or adding clarifying statements, to better elicit an answer in the style of: '{current_output_style_guidance}'.",
|
451
|
-
"increase_complexity_and_detail": f"Significantly elaborate on this instruction. Add more details, examples, context, or constraints to make it more comprehensive. The goal of this elaboration is to make the prompt itself more detailed, so that it VERY CLEARLY guides the target LLM to produce a highly accurate final answer in the style of: '{current_output_style_guidance}'. The prompt can be long if needed to achieve this output style.",
|
452
|
-
}
|
453
|
-
|
454
|
-
user_prompt_for_semantic_mutation = f"""Given this prompt: '{prompt}'
|
455
|
-
Task context: {self._get_task_description_for_llm(initial_prompt)}
|
456
|
-
Desired output style from target LLM: '{current_output_style_guidance}'
|
457
|
-
Instruction for this modification: {strategy_prompts[strategy]}.
|
458
|
-
Return only the modified prompt message list, nothing else. Make sure to return a valid JSON object.
|
459
|
-
"""
|
460
|
-
response = self._call_model(
|
461
|
-
messages=[
|
462
|
-
{
|
463
|
-
"role": "system",
|
464
|
-
"content": f"You are a prompt engineering expert. Your goal is to modify prompts to improve their effectiveness in eliciting specific types of answers, particularly matching the style: '{current_output_style_guidance}'. Follow the specific modification instruction provided.",
|
465
|
-
},
|
466
|
-
{"role": "user", "content": user_prompt_for_semantic_mutation},
|
467
|
-
],
|
468
|
-
is_reasoning=True,
|
469
|
-
)
|
470
|
-
|
471
|
-
return chat_prompt.ChatPrompt(messages=utils.json_to_dict(response.strip()))
|
472
|
-
except Exception as e:
|
473
|
-
reporting.display_error(
|
474
|
-
f" Error in semantic mutation, this is usually a parsing error: {e}",
|
475
|
-
verbose=self.verbose,
|
476
|
-
)
|
477
|
-
return prompt
|
478
|
-
|
479
|
-
def _structural_mutation(
|
480
|
-
self, prompt: chat_prompt.ChatPrompt
|
481
|
-
) -> chat_prompt.ChatPrompt:
|
482
|
-
"""Perform structural mutation (reordering, combining, splitting)."""
|
483
|
-
mutated_messages: List[Dict[str, str]] = []
|
484
|
-
|
485
|
-
for message in prompt.get_messages():
|
486
|
-
content = message["content"]
|
487
|
-
role = message["role"]
|
488
|
-
|
489
|
-
sentences = [s.strip() for s in content.split(".") if s.strip()]
|
490
|
-
if len(sentences) <= 1:
|
491
|
-
mutated_messages.append(
|
492
|
-
{"role": role, "content": self._word_level_mutation(content)}
|
493
|
-
)
|
494
|
-
continue
|
495
|
-
|
496
|
-
mutation_type = random.random()
|
497
|
-
if mutation_type < 0.3:
|
498
|
-
# Reorder sentences
|
499
|
-
random.shuffle(sentences)
|
500
|
-
mutated_messages.append(
|
501
|
-
{"role": role, "content": ". ".join(sentences) + "."}
|
502
|
-
)
|
503
|
-
continue
|
504
|
-
elif mutation_type < 0.6:
|
505
|
-
# Combine adjacent sentences
|
506
|
-
if len(sentences) >= 2:
|
507
|
-
idx = random.randint(0, len(sentences) - 2)
|
508
|
-
combined = sentences[idx] + " and " + sentences[idx + 1]
|
509
|
-
sentences[idx : idx + 2] = [combined]
|
510
|
-
mutated_messages.append(
|
511
|
-
{"role": role, "content": ". ".join(sentences) + "."}
|
512
|
-
)
|
513
|
-
continue
|
514
|
-
else:
|
515
|
-
# Split a sentence
|
516
|
-
idx = random.randint(0, len(sentences) - 1)
|
517
|
-
words = sentences[idx].split()
|
518
|
-
if len(words) > 3:
|
519
|
-
split_point = random.randint(2, len(words) - 2)
|
520
|
-
sentences[idx : idx + 1] = [
|
521
|
-
" ".join(words[:split_point]),
|
522
|
-
" ".join(words[split_point:]),
|
523
|
-
]
|
524
|
-
mutated_messages.append(
|
525
|
-
{"role": role, "content": ". ".join(sentences) + "."}
|
526
|
-
)
|
527
|
-
continue
|
528
|
-
else:
|
529
|
-
mutated_messages.append({"role": role, "content": content})
|
530
|
-
|
531
|
-
return chat_prompt.ChatPrompt(messages=mutated_messages)
|
532
|
-
|
533
|
-
def _word_level_mutation_prompt(
|
534
|
-
self, prompt: chat_prompt.ChatPrompt
|
535
|
-
) -> chat_prompt.ChatPrompt:
|
536
|
-
mutated_messages: List[Dict[str, str]] = []
|
537
|
-
for message in prompt.get_messages():
|
538
|
-
mutated_messages.append(
|
539
|
-
{
|
540
|
-
"role": message["role"],
|
541
|
-
"content": self._word_level_mutation(message["content"]),
|
542
|
-
}
|
543
|
-
)
|
544
|
-
return chat_prompt.ChatPrompt(messages=mutated_messages)
|
545
|
-
|
546
|
-
def _word_level_mutation(self, msg_content: str) -> str:
|
547
|
-
"""Perform word-level mutation."""
|
548
|
-
words = msg_content.split()
|
549
|
-
if len(words) <= 1:
|
550
|
-
return msg_content
|
551
|
-
|
552
|
-
mutation_type = random.random()
|
553
|
-
if mutation_type < 0.3:
|
554
|
-
# Word replacement
|
555
|
-
idx = random.randint(0, len(words) - 1)
|
556
|
-
words[idx] = self._get_synonym(words[idx])
|
557
|
-
elif mutation_type < 0.6:
|
558
|
-
# Word reordering
|
559
|
-
if len(words) > 2:
|
560
|
-
i, j = random.sample(range(len(words)), 2)
|
561
|
-
words[i], words[j] = words[j], words[i]
|
562
|
-
else:
|
563
|
-
# Phrase modification
|
564
|
-
idx = random.randint(0, len(words) - 1)
|
565
|
-
words[idx] = self._modify_phrase(words[idx])
|
566
|
-
|
567
|
-
return " ".join(words)
|
568
|
-
|
569
|
-
def _get_synonym(self, word: str) -> str:
|
570
|
-
"""Get a synonym for a word using LLM."""
|
571
|
-
try:
|
572
|
-
response = self._call_model(
|
573
|
-
messages=[
|
574
|
-
{
|
575
|
-
"role": "system",
|
576
|
-
"content": "You are a helpful assistant that provides synonyms. Return only the synonym word, no explanation or additional text.",
|
577
|
-
},
|
578
|
-
{
|
579
|
-
"role": "user",
|
580
|
-
"content": f"Give me a single synonym for the word '{word}'. Return only the synonym, nothing else.",
|
581
|
-
},
|
582
|
-
],
|
583
|
-
is_reasoning=True,
|
584
|
-
)
|
585
|
-
return response.strip()
|
586
|
-
except Exception as e:
|
587
|
-
logger.warning(f"Error getting synonym for '{word}': {e}")
|
588
|
-
return word
|
589
|
-
|
590
|
-
def _modify_phrase(self, phrase: str) -> str:
|
591
|
-
"""Modify a phrase while preserving meaning using LLM."""
|
592
|
-
try:
|
593
|
-
response = self._call_model(
|
594
|
-
messages=[
|
595
|
-
{
|
596
|
-
"role": "system",
|
597
|
-
"content": "You are a helpful assistant that rephrases text. Return only the modified phrase, no explanation or additional text.",
|
598
|
-
},
|
599
|
-
{
|
600
|
-
"role": "user",
|
601
|
-
"content": f"Modify this phrase while keeping the same meaning: '{phrase}'. Return only the modified phrase, nothing else.",
|
602
|
-
},
|
603
|
-
],
|
604
|
-
is_reasoning=True,
|
605
|
-
)
|
606
|
-
return response.strip()
|
607
|
-
except Exception as e:
|
608
|
-
logger.warning(f"Error modifying phrase '{phrase}': {e}")
|
609
|
-
return phrase
|
610
|
-
|
611
|
-
def _radical_innovation_mutation(
|
612
|
-
self, prompt: chat_prompt.ChatPrompt, initial_prompt: chat_prompt.ChatPrompt
|
613
|
-
) -> chat_prompt.ChatPrompt:
|
614
|
-
"""Attempts to generate a significantly improved and potentially very different prompt using an LLM."""
|
615
|
-
logger.debug(
|
616
|
-
f"Attempting radical innovation for prompt: {json.dumps(prompt.get_messages())[:70]}..."
|
617
|
-
)
|
618
|
-
task_desc_for_llm = self._get_task_description_for_llm(initial_prompt)
|
619
|
-
current_output_style_guidance = self.output_style_guidance
|
620
|
-
|
621
|
-
user_prompt_for_radical_innovation = f"""Task Context:
|
622
|
-
{task_desc_for_llm}
|
623
|
-
Desired output style from target LLM: '{current_output_style_guidance}'
|
624
|
-
|
625
|
-
Existing Prompt (which may be underperforming):
|
626
|
-
'''{prompt.get_messages()}'''
|
627
|
-
|
628
|
-
Please generate a new, significantly improved, and potentially very different prompt for this task.
|
629
|
-
Focus on alternative approaches, better clarity, or more effective guidance for the language model, aiming for the desired output style.
|
630
|
-
Return only the new prompt list object.
|
631
|
-
"""
|
632
|
-
try:
|
633
|
-
new_prompt_str = self._call_model(
|
634
|
-
messages=[
|
635
|
-
{
|
636
|
-
"role": "system",
|
637
|
-
"content": self._get_radical_innovation_system_prompt(),
|
638
|
-
},
|
639
|
-
{"role": "user", "content": user_prompt_for_radical_innovation},
|
640
|
-
],
|
641
|
-
is_reasoning=True,
|
642
|
-
)
|
643
|
-
logger.info(
|
644
|
-
f"Radical innovation generated: {new_prompt_str[:70]}... from: {json.dumps(prompt.get_messages())[:70]}..."
|
645
|
-
)
|
646
|
-
return chat_prompt.ChatPrompt(messages=json.loads(new_prompt_str))
|
647
|
-
except Exception as e:
|
648
|
-
logger.warning(
|
649
|
-
f"Radical innovation mutation failed for prompt '{json.dumps(prompt.get_messages())[:50]}...': {e}. Returning original."
|
650
|
-
)
|
651
|
-
return prompt
|
652
|
-
|
653
|
-
def _initialize_population(
|
654
|
-
self, prompt: chat_prompt.ChatPrompt
|
655
|
-
) -> List[chat_prompt.ChatPrompt]:
|
656
|
-
"""Initialize the population with diverse variations of the initial prompt,
|
657
|
-
including some 'fresh start' prompts based purely on task description.
|
658
|
-
All generated prompts should aim to elicit answers matching self.output_style_guidance.
|
659
|
-
"""
|
660
|
-
with reporting.initializing_population(verbose=self.verbose) as init_pop_report:
|
661
|
-
init_pop_report.start(self.population_size)
|
662
|
-
|
663
|
-
population = [prompt]
|
664
|
-
if self.population_size <= 1:
|
665
|
-
return population
|
666
|
-
|
667
|
-
num_to_generate_total = self.population_size - 1
|
668
|
-
num_fresh_starts = max(1, int(num_to_generate_total * 0.2))
|
669
|
-
num_variations_on_initial = num_to_generate_total - num_fresh_starts
|
670
|
-
|
671
|
-
task_desc_for_llm = self._get_task_description_for_llm(prompt)
|
672
|
-
current_output_style_guidance = self.output_style_guidance
|
673
|
-
|
674
|
-
# Generate "fresh start" prompts if the initial prompt is not performing well
|
675
|
-
# Cold start prompts are generated from the task description
|
676
|
-
if num_fresh_starts > 0:
|
677
|
-
init_pop_report.start_fresh_prompts(num_fresh_starts)
|
678
|
-
fresh_start_user_prompt = f"""Here is a description of a task:
|
679
|
-
{task_desc_for_llm}
|
680
|
-
|
681
|
-
The goal is to generate prompts that will make a target LLM produce responses in the following style: '{current_output_style_guidance}'.
|
682
|
-
|
683
|
-
Please generate {num_fresh_starts} diverse and effective prompt(s) for a language model to accomplish this task, ensuring they guide towards this specific output style.
|
684
|
-
Focus on clarity, completeness, and guiding the model effectively towards the desired style. Explore different structural approaches.
|
685
|
-
|
686
|
-
Example of valid response: [
|
687
|
-
["role": "<role>", "content": "<Prompt targeting specified style.>"],
|
688
|
-
["role": "<role>", "content": "<Another prompt designed for the output style.>"]
|
689
|
-
]
|
690
|
-
|
691
|
-
Your response MUST be a valid JSON list of AI messages. Do NOT include any other text, explanations, or Markdown formatting like ```json ... ``` around the list.
|
692
|
-
|
693
|
-
"""
|
694
|
-
try:
|
695
|
-
response_content = self._call_model(
|
696
|
-
messages=[
|
697
|
-
{
|
698
|
-
"role": "system",
|
699
|
-
"content": f"You are an expert prompt engineer. Your task is to generate novel, effective prompts from scratch based on a task description, specifically aiming for prompts that elicit answers in the style: '{current_output_style_guidance}'. Output ONLY a raw JSON list of strings.",
|
700
|
-
},
|
701
|
-
{"role": "user", "content": fresh_start_user_prompt},
|
702
|
-
],
|
703
|
-
is_reasoning=True,
|
704
|
-
)
|
705
|
-
|
706
|
-
logger.debug(
|
707
|
-
f"Raw LLM response for fresh start prompts: {response_content}"
|
708
|
-
)
|
709
|
-
|
710
|
-
fresh_prompts = utils.json_to_dict(response_content)
|
711
|
-
if isinstance(fresh_prompts, list):
|
712
|
-
if all(isinstance(p, dict) for p in fresh_prompts) and all(
|
713
|
-
p.get("role") is not None for p in fresh_prompts
|
714
|
-
):
|
715
|
-
population.append(
|
716
|
-
chat_prompt.ChatPrompt(messages=fresh_prompts)
|
717
|
-
)
|
718
|
-
init_pop_report.success_fresh_prompts(1)
|
719
|
-
elif all(isinstance(p, list) for p in fresh_prompts):
|
720
|
-
population.extend(
|
721
|
-
[
|
722
|
-
chat_prompt.ChatPrompt(messages=p)
|
723
|
-
for p in fresh_prompts[:num_fresh_starts]
|
724
|
-
]
|
725
|
-
)
|
726
|
-
init_pop_report.success_fresh_prompts(
|
727
|
-
len(fresh_prompts[:num_fresh_starts])
|
728
|
-
)
|
729
|
-
else:
|
730
|
-
init_pop_report.failed_fresh_prompts(
|
731
|
-
num_fresh_starts,
|
732
|
-
f"LLM response for fresh starts was not a valid list of strings or was empty: {response_content}. Skipping fresh start prompts.",
|
733
|
-
)
|
734
|
-
except json.JSONDecodeError as e_json:
|
735
|
-
init_pop_report.failed_fresh_prompts(
|
736
|
-
num_fresh_starts,
|
737
|
-
f"JSONDecodeError generating fresh start prompts: {e_json}. LLM response: '{response_content}'. Skipping fresh start prompts.",
|
738
|
-
)
|
739
|
-
except Exception as e:
|
740
|
-
init_pop_report.failed_fresh_prompts(
|
741
|
-
num_fresh_starts,
|
742
|
-
f"Error generating fresh start prompts: {e}. Skipping fresh start prompts.",
|
743
|
-
)
|
744
|
-
|
745
|
-
# Generate variations on the initial prompt for the remaining slots
|
746
|
-
# TODO: Could add variations with hyper-parameters from the task config like temperature, etc.
|
747
|
-
if num_variations_on_initial > 0:
|
748
|
-
init_pop_report.start_variations(num_variations_on_initial)
|
749
|
-
|
750
|
-
# TODO: We need to split this into batches as the model will not return enough tokens
|
751
|
-
# to generate all the candidates
|
752
|
-
user_prompt_for_variation = f"""Initial prompt:
|
753
|
-
'''{prompt.get_messages()}'''
|
754
|
-
|
755
|
-
Task context:
|
756
|
-
{task_desc_for_llm}
|
757
|
-
Desired output style from target LLM: '{current_output_style_guidance}'
|
758
|
-
|
759
|
-
Generate {num_variations_on_initial} diverse alternative prompts based on the initial prompt above, keeping the task context and desired output style in mind.
|
760
|
-
All generated prompt variations should strongly aim to elicit answers from the target LLM matching the style: '{current_output_style_guidance}'.
|
761
|
-
For each variation, consider how to best achieve this style, e.g., by adjusting specificity, structure, phrasing, constraints, or by explicitly requesting it.
|
762
|
-
|
763
|
-
Return a JSON array of prompts with the following structure:
|
764
|
-
{{
|
765
|
-
"prompts": [
|
766
|
-
{{
|
767
|
-
"prompt": [{{"role": "<role>", "content": "<content>"}}],
|
768
|
-
"strategy": "brief description of the variation strategy used, e.g., 'direct instruction for target style'"
|
769
|
-
}}
|
770
|
-
// ... more prompts if num_variations_on_initial > 1
|
771
|
-
]
|
772
|
-
}}
|
773
|
-
Ensure a good mix of variations, all targeting the specified output style from the end LLM.
|
774
|
-
|
775
|
-
Return a valid JSON object that is correctly escaped. Return nothing else, d`o not include any additional text or Markdown formatting.
|
776
|
-
"""
|
777
|
-
try:
|
778
|
-
response_content_variations = self._call_model(
|
779
|
-
messages=[
|
780
|
-
{
|
781
|
-
"role": "system",
|
782
|
-
"content": self._get_reasoning_system_prompt_for_variation(),
|
783
|
-
},
|
784
|
-
{"role": "user", "content": user_prompt_for_variation},
|
785
|
-
],
|
786
|
-
is_reasoning=True,
|
787
|
-
)
|
788
|
-
logger.debug(
|
789
|
-
f"Raw response for population variations: {response_content_variations}"
|
790
|
-
)
|
791
|
-
json_response_variations = json.loads(response_content_variations)
|
792
|
-
generated_prompts_variations = [
|
793
|
-
p["prompt"]
|
794
|
-
for p in json_response_variations.get("prompts", [])
|
795
|
-
if isinstance(p, dict) and "prompt" in p
|
796
|
-
]
|
797
|
-
|
798
|
-
if generated_prompts_variations:
|
799
|
-
init_pop_report.success_variations(
|
800
|
-
len(
|
801
|
-
generated_prompts_variations[:num_variations_on_initial]
|
802
|
-
)
|
803
|
-
)
|
804
|
-
population.extend(
|
805
|
-
[
|
806
|
-
chat_prompt.ChatPrompt(messages=p)
|
807
|
-
for p in generated_prompts_variations[
|
808
|
-
:num_variations_on_initial
|
809
|
-
]
|
810
|
-
]
|
811
|
-
)
|
812
|
-
else:
|
813
|
-
init_pop_report.failed_variations(
|
814
|
-
num_variations_on_initial,
|
815
|
-
"Could not parse 'prompts' list for variations. Skipping variations.",
|
816
|
-
)
|
817
|
-
except Exception as e:
|
818
|
-
init_pop_report.failed_variations(
|
819
|
-
num_variations_on_initial,
|
820
|
-
f"Error calling LLM for initial population variations: {e}",
|
821
|
-
)
|
822
|
-
|
823
|
-
# Ensure population is of the required size using unique prompts
|
824
|
-
# TODO Test with levenshtein distance
|
825
|
-
final_population_set: Set[str] = set()
|
826
|
-
final_population_list: List[chat_prompt.ChatPrompt] = []
|
827
|
-
for p in population:
|
828
|
-
if json.dumps(p.get_messages()) not in final_population_set:
|
829
|
-
final_population_set.add(json.dumps(p.get_messages()))
|
830
|
-
final_population_list.append(p)
|
831
|
-
|
832
|
-
init_pop_report.end(final_population_list)
|
833
|
-
# Return exactly population_size prompts if possible, or fewer if generation failed badly.
|
834
|
-
return final_population_list[: self.population_size]
|
388
|
+
# Mutations and helpers are implemented in mixins.
|
835
389
|
|
836
390
|
def _should_restart_population(self, curr_best: float) -> bool:
|
837
391
|
"""
|
@@ -852,9 +406,9 @@ Return only the new prompt list object.
|
|
852
406
|
def _restart_population(
|
853
407
|
self,
|
854
408
|
hof: tools.HallOfFame,
|
855
|
-
population:
|
409
|
+
population: list[Any],
|
856
410
|
best_prompt_so_far: chat_prompt.ChatPrompt,
|
857
|
-
) ->
|
411
|
+
) -> list[Any]:
|
858
412
|
"""Return a fresh, evaluated population seeded by elites."""
|
859
413
|
if self.enable_moo:
|
860
414
|
elites = list(hof)
|
@@ -870,7 +424,7 @@ Return only the new prompt list object.
|
|
870
424
|
)
|
871
425
|
|
872
426
|
prompt_variants = self._initialize_population(seed_prompt)
|
873
|
-
new_pop = [
|
427
|
+
new_pop = [self._create_individual_from_prompt(p) for p in prompt_variants]
|
874
428
|
|
875
429
|
for ind, fit in zip(new_pop, map(self.toolbox.evaluate, new_pop)):
|
876
430
|
ind.fitness.values = fit
|
@@ -881,12 +435,12 @@ Return only the new prompt list object.
|
|
881
435
|
def _run_generation(
|
882
436
|
self,
|
883
437
|
generation_idx: int,
|
884
|
-
population:
|
438
|
+
population: list[Any],
|
885
439
|
prompt: chat_prompt.ChatPrompt,
|
886
440
|
hof: tools.HallOfFame,
|
887
441
|
report: Any,
|
888
442
|
best_primary_score_overall: float,
|
889
|
-
) -> tuple[
|
443
|
+
) -> tuple[list[Any], int]:
|
890
444
|
"""Execute mating, mutation, evaluation and HoF update."""
|
891
445
|
best_gen_score = 0.0
|
892
446
|
|
@@ -952,7 +506,7 @@ Return only the new prompt list object.
|
|
952
506
|
|
953
507
|
return offspring, len(invalid)
|
954
508
|
|
955
|
-
def _population_best_score(self, population:
|
509
|
+
def _population_best_score(self, population: list[Any]) -> float:
|
956
510
|
"""Return highest primary-objective score among *valid* individuals."""
|
957
511
|
valid_scores = [
|
958
512
|
ind.fitness.values[0] for ind in population if ind.fitness.valid
|
@@ -964,10 +518,10 @@ Return only the new prompt list object.
|
|
964
518
|
prompt: chat_prompt.ChatPrompt,
|
965
519
|
dataset: opik.Dataset,
|
966
520
|
metric: Callable,
|
967
|
-
experiment_config:
|
968
|
-
n_samples:
|
521
|
+
experiment_config: dict | None = None,
|
522
|
+
n_samples: int | None = None,
|
969
523
|
auto_continue: bool = False,
|
970
|
-
agent_class:
|
524
|
+
agent_class: type[OptimizableAgent] | None = None,
|
971
525
|
**kwargs: Any,
|
972
526
|
) -> OptimizationResult:
|
973
527
|
"""
|
@@ -978,35 +532,27 @@ Return only the new prompt list object.
|
|
978
532
|
experiment_config: Optional experiment configuration
|
979
533
|
n_samples: Optional number of samples to use
|
980
534
|
auto_continue: Whether to automatically continue optimization
|
981
|
-
|
535
|
+
agent_class: Optional agent class to use
|
536
|
+
**kwargs: Additional keyword arguments including:
|
537
|
+
mcp_config (MCPExecutionConfig | None): MCP tool calling configuration (default: None)
|
982
538
|
"""
|
983
|
-
|
984
|
-
|
985
|
-
|
986
|
-
|
987
|
-
raise ValueError("Dataset must be a Dataset object")
|
539
|
+
# Use base class validation and setup methods
|
540
|
+
self.validate_optimization_inputs(prompt, dataset, metric)
|
541
|
+
self.configure_prompt_model(prompt)
|
542
|
+
self.agent_class = self.setup_agent_class(prompt, agent_class)
|
988
543
|
|
989
|
-
|
990
|
-
|
991
|
-
|
992
|
-
|
993
|
-
|
994
|
-
if prompt.model is None:
|
995
|
-
prompt.model = self.model
|
996
|
-
if prompt.model_kwargs is None:
|
997
|
-
prompt.model_kwargs = self.model_kwargs
|
998
|
-
|
999
|
-
if agent_class is None:
|
1000
|
-
self.agent_class = utils.create_litellm_agent_class(prompt)
|
1001
|
-
else:
|
1002
|
-
self.agent_class = agent_class
|
544
|
+
# Extract MCP config from kwargs (for optional MCP workflows)
|
545
|
+
mcp_config = kwargs.pop("mcp_config", None)
|
546
|
+
evaluation_kwargs: dict[str, Any] = {}
|
547
|
+
if mcp_config is not None:
|
548
|
+
evaluation_kwargs["mcp_config"] = mcp_config
|
1003
549
|
|
1004
550
|
self.project_name = self.agent_class.project_name
|
1005
551
|
|
1006
552
|
# Step 0. Start Opik optimization run
|
1007
|
-
opik_optimization_run:
|
553
|
+
opik_optimization_run: optimization.Optimization | None = None
|
1008
554
|
try:
|
1009
|
-
opik_optimization_run = self.
|
555
|
+
opik_optimization_run = self.opik_client.create_optimization(
|
1010
556
|
dataset_name=dataset.name,
|
1011
557
|
objective_name=metric.__name__,
|
1012
558
|
metadata={"optimizer": self.__class__.__name__},
|
@@ -1026,18 +572,19 @@ Return only the new prompt list object.
|
|
1026
572
|
reporting.display_configuration(
|
1027
573
|
prompt.get_messages(),
|
1028
574
|
{
|
1029
|
-
"optimizer": f"{
|
575
|
+
"optimizer": f"{'DEAP MOO' if self.enable_moo else 'DEAP SO'} Evolutionary Optimization",
|
1030
576
|
"population_size": self.population_size,
|
1031
577
|
"generations": self.num_generations,
|
1032
578
|
"mutation_rate": self.mutation_rate,
|
1033
579
|
"crossover_rate": self.crossover_rate,
|
1034
580
|
},
|
1035
581
|
verbose=self.verbose,
|
582
|
+
tools=getattr(prompt, "tools", None),
|
1036
583
|
)
|
1037
584
|
|
1038
585
|
# Step 1. Step variables and define fitness function
|
1039
|
-
self.
|
1040
|
-
self._history:
|
586
|
+
self.reset_counters() # Reset counters for run
|
587
|
+
self._history: list[OptimizationRound] = []
|
1041
588
|
self._current_generation = 0
|
1042
589
|
self._best_fitness_history = []
|
1043
590
|
self._generations_without_improvement = 0
|
@@ -1047,8 +594,8 @@ Return only the new prompt list object.
|
|
1047
594
|
if self.enable_moo:
|
1048
595
|
|
1049
596
|
def _deap_evaluate_individual_fitness(
|
1050
|
-
messages:
|
1051
|
-
) ->
|
597
|
+
messages: list[dict[str, str]],
|
598
|
+
) -> tuple[float, float]:
|
1052
599
|
primary_fitness_score: float = self._evaluate_prompt(
|
1053
600
|
prompt,
|
1054
601
|
messages, # type: ignore
|
@@ -1058,6 +605,7 @@ Return only the new prompt list object.
|
|
1058
605
|
experiment_config=(experiment_config or {}).copy(),
|
1059
606
|
optimization_id=self._current_optimization_id,
|
1060
607
|
verbose=0,
|
608
|
+
**evaluation_kwargs,
|
1061
609
|
)
|
1062
610
|
prompt_length = float(len(str(json.dumps(messages))))
|
1063
611
|
return (primary_fitness_score, prompt_length)
|
@@ -1065,8 +613,8 @@ Return only the new prompt list object.
|
|
1065
613
|
else:
|
1066
614
|
# Single-objective
|
1067
615
|
def _deap_evaluate_individual_fitness(
|
1068
|
-
messages:
|
1069
|
-
) ->
|
616
|
+
messages: list[dict[str, str]],
|
617
|
+
) -> tuple[float, float]:
|
1070
618
|
fitness_score: float = self._evaluate_prompt(
|
1071
619
|
prompt,
|
1072
620
|
messages, # type: ignore
|
@@ -1076,6 +624,7 @@ Return only the new prompt list object.
|
|
1076
624
|
experiment_config=(experiment_config or {}).copy(),
|
1077
625
|
optimization_id=self._current_optimization_id,
|
1078
626
|
verbose=0,
|
627
|
+
**evaluation_kwargs,
|
1079
628
|
)
|
1080
629
|
return (fitness_score, 0.0)
|
1081
630
|
|
@@ -1123,12 +672,12 @@ Return only the new prompt list object.
|
|
1123
672
|
self.output_style_guidance = self.DEFAULT_OUTPUT_STYLE_GUIDANCE
|
1124
673
|
|
1125
674
|
# Step 4. Initialize population
|
1126
|
-
initial_prompts:
|
675
|
+
initial_prompts: list[chat_prompt.ChatPrompt] = self._initialize_population(
|
1127
676
|
prompt=prompt
|
1128
677
|
)
|
1129
678
|
|
1130
679
|
deap_population = [
|
1131
|
-
|
680
|
+
self._create_individual_from_prompt(p) for p in initial_prompts
|
1132
681
|
]
|
1133
682
|
deap_population = deap_population[: self.population_size]
|
1134
683
|
|
@@ -1143,7 +692,7 @@ Return only the new prompt list object.
|
|
1143
692
|
with reporting.evaluate_initial_population(
|
1144
693
|
verbose=self.verbose
|
1145
694
|
) as report_initial_population:
|
1146
|
-
fitnesses:
|
695
|
+
fitnesses: list[Any] = list(map(self.toolbox.evaluate, deap_population))
|
1147
696
|
_best_score = max(
|
1148
697
|
best_primary_score_overall, max([x[0] for x in fitnesses])
|
1149
698
|
)
|
@@ -1302,7 +851,7 @@ Return only the new prompt list object.
|
|
1302
851
|
hof, key=lambda ind: ind.fitness.values[0], reverse=True
|
1303
852
|
)
|
1304
853
|
for i, sol in enumerate(sorted_hof):
|
1305
|
-
final_results_log += f" Solution {i+1}: Primary Score={sol.fitness.values[0]:.4f}, Length={sol.fitness.values[1]:.0f}, Prompt='{str(sol)[:100]}...'\n"
|
854
|
+
final_results_log += f" Solution {i + 1}: Primary Score={sol.fitness.values[0]:.4f}, Length={sol.fitness.values[1]:.0f}, Prompt='{str(sol)[:100]}...'\n"
|
1306
855
|
best_overall_solution = sorted_hof[0]
|
1307
856
|
final_best_prompt = chat_prompt.ChatPrompt(
|
1308
857
|
messages=best_overall_solution
|
@@ -1419,7 +968,21 @@ Return only the new prompt list object.
|
|
1419
968
|
best_score=final_primary_score,
|
1420
969
|
best_prompt=final_best_prompt.get_messages(),
|
1421
970
|
verbose=self.verbose,
|
971
|
+
tools=getattr(final_best_prompt, "tools", None),
|
1422
972
|
)
|
973
|
+
|
974
|
+
final_tools = getattr(final_best_prompt, "tools", None)
|
975
|
+
if final_tools:
|
976
|
+
final_details["final_tools"] = final_tools
|
977
|
+
tool_prompts = {
|
978
|
+
(tool.get("function", {}).get("name") or f"tool_{idx}"): tool.get(
|
979
|
+
"function", {}
|
980
|
+
).get("description")
|
981
|
+
for idx, tool in enumerate(final_tools)
|
982
|
+
}
|
983
|
+
else:
|
984
|
+
tool_prompts = None
|
985
|
+
|
1423
986
|
return OptimizationResult(
|
1424
987
|
optimizer=self.__class__.__name__,
|
1425
988
|
prompt=final_best_prompt.get_messages(),
|
@@ -1430,357 +993,117 @@ Return only the new prompt list object.
|
|
1430
993
|
details=final_details,
|
1431
994
|
history=[x.model_dump() for x in self.get_history()],
|
1432
995
|
llm_calls=self.llm_call_counter,
|
996
|
+
tool_calls=self.tool_call_counter,
|
1433
997
|
dataset_id=dataset.id,
|
1434
998
|
optimization_id=self._current_optimization_id,
|
999
|
+
tool_prompts=tool_prompts,
|
1435
1000
|
)
|
1436
1001
|
|
1437
|
-
|
1438
|
-
def _call_model(
|
1439
|
-
self,
|
1440
|
-
messages: List[Dict[str, str]],
|
1441
|
-
is_reasoning: bool = False,
|
1442
|
-
optimization_id: Optional[str] = None,
|
1443
|
-
) -> str:
|
1444
|
-
"""Call the model with the given prompt and return the response."""
|
1445
|
-
try:
|
1446
|
-
# Basic LLM parameters
|
1447
|
-
llm_config_params = {
|
1448
|
-
"temperature": getattr(self, "temperature", 0.3),
|
1449
|
-
"max_tokens": getattr(self, "max_tokens", 1000),
|
1450
|
-
"top_p": getattr(self, "top_p", 1.0),
|
1451
|
-
"frequency_penalty": getattr(self, "frequency_penalty", 0.0),
|
1452
|
-
"presence_penalty": getattr(self, "presence_penalty", 0.0),
|
1453
|
-
}
|
1454
|
-
|
1455
|
-
# Prepare metadata for opik
|
1456
|
-
metadata_for_opik: Dict[str, Any] = {}
|
1457
|
-
if self.project_name:
|
1458
|
-
metadata_for_opik["project_name"] = self.project_name
|
1459
|
-
metadata_for_opik["opik"] = {"project_name": self.project_name}
|
1460
|
-
|
1461
|
-
if optimization_id:
|
1462
|
-
if "opik" in metadata_for_opik:
|
1463
|
-
metadata_for_opik["opik"]["optimization_id"] = optimization_id
|
1464
|
-
|
1465
|
-
metadata_for_opik["optimizer_name"] = self.__class__.__name__
|
1466
|
-
metadata_for_opik["opik_call_type"] = (
|
1467
|
-
"reasoning" if is_reasoning else "evaluation_llm_task_direct"
|
1468
|
-
)
|
1469
|
-
|
1470
|
-
if metadata_for_opik:
|
1471
|
-
llm_config_params["metadata"] = metadata_for_opik
|
1472
|
-
|
1473
|
-
# Pass llm_config_params to the Opik monitor
|
1474
|
-
final_call_params = opik_litellm_monitor.try_add_opik_monitoring_to_params(
|
1475
|
-
llm_config_params.copy()
|
1476
|
-
)
|
1477
|
-
|
1478
|
-
logger.debug(
|
1479
|
-
f"Calling model '{self.model}' with messages: {messages}, "
|
1480
|
-
f"final params for litellm (from monitor): {final_call_params}"
|
1481
|
-
)
|
1482
|
-
|
1483
|
-
response = litellm.completion(
|
1484
|
-
model=self.model, messages=messages, **final_call_params
|
1485
|
-
)
|
1486
|
-
self.llm_call_counter += 1
|
1487
|
-
|
1488
|
-
logger.debug(f"Response: {response}")
|
1489
|
-
return response.choices[0].message.content
|
1490
|
-
except litellm_exceptions.RateLimitError as e:
|
1491
|
-
logger.error(f"LiteLLM Rate Limit Error: {e}")
|
1492
|
-
raise
|
1493
|
-
except litellm_exceptions.APIConnectionError as e:
|
1494
|
-
logger.error(f"LiteLLM API Connection Error: {e}")
|
1495
|
-
raise
|
1496
|
-
except litellm_exceptions.ContextWindowExceededError as e:
|
1497
|
-
logger.error(f"LiteLLM Context Window Exceeded Error: {e}")
|
1498
|
-
raise
|
1499
|
-
except Exception as e:
|
1500
|
-
logger.error(
|
1501
|
-
f"Error calling model '{self.model}': {type(e).__name__} - {e}"
|
1502
|
-
)
|
1503
|
-
raise
|
1504
|
-
|
1505
|
-
def _evaluate_prompt(
|
1002
|
+
def optimize_mcp(
|
1506
1003
|
self,
|
1507
1004
|
prompt: chat_prompt.ChatPrompt,
|
1508
|
-
messages: List[Dict[str, str]],
|
1509
1005
|
dataset: opik.Dataset,
|
1510
1006
|
metric: Callable,
|
1511
|
-
|
1512
|
-
|
1513
|
-
|
1514
|
-
|
1515
|
-
|
1007
|
+
*,
|
1008
|
+
tool_name: str,
|
1009
|
+
          +         second_pass: MCPSecondPassCoordinator,
     1010 +         experiment_config: dict | None = None,
     1011 +         n_samples: int | None = None,
     1012 +         auto_continue: bool = False,
     1013 +         agent_class: type[OptimizableAgent] | None = None,
     1014 +         fallback_invoker: Callable[[dict[str, Any]], str] | None = None,
     1015 +         fallback_arguments: Callable[[Any], dict[str, Any]] | None = None,
     1016 +         allow_tool_use_on_second_pass: bool = False,
1516 1017           **kwargs: Any,
1517      -     ) ->
1518      -
1519      -
1520      -
1521      -
1522      -
1523      -
1524      -
1525      -
1526      -
1527      -
1528      -
1529      -
1530      -
1531      -
1532      -
1533      -
1534      -
1535      -
1536      -
1537      -
1538      -
1539      -
1540      -
1541      -
1542      -
1543      -
1544      -   "
1545      -   "
1546      -
1547      -
1548      -
1549      -
1550      -
1551      -
1552      -
1553      -
1554      -
1555      -
1556      -
1557      -
1558      -
1559      -
1560      -
1561      -         def llm_task(dataset_item: Dict[str, Any]) -> Dict[str, str]:
1562      -             # print("MESSAGES:", new_prompt.messages)
1563      -             messages = new_prompt.get_messages(dataset_item)
1564      -             model_output = agent.invoke(messages)
1565      -             # print("OUTPUT:", model_output)
1566      -             return {mappers.EVALUATED_LLM_TASK_OUTPUT: model_output}
1567      -
1568      -         # Evaluate the prompt
1569      -         score = task_evaluator.evaluate(
1570      -             dataset=dataset,
1571      -             dataset_item_ids=dataset_item_ids,
1572      -             metric=metric,
1573      -             evaluated_task=llm_task,
1574      -             num_threads=self.num_threads,
1575      -             project_name=experiment_config["project_name"],
1576      -             n_samples=n_samples if dataset_item_ids is None else None,
1577      -             experiment_config=experiment_config,
1578      -             optimization_id=optimization_id,
1579      -             verbose=verbose,
1580      -         )
1581      -         return score
1582      -
1583      -     def _llm_deap_crossover(self, ind1: Any, ind2: Any) -> Tuple[Any, Any]:
1584      -         """Perform crossover by asking an LLM to blend two parent prompts."""
1585      -         reporting.display_message(
1586      -             " Recombining prompts using an LLM.", verbose=self.verbose
     1018 +     ) -> OptimizationResult:
     1019 +         if prompt.tools is None or not prompt.tools:
     1020 +             raise ValueError("Prompt must include tools for MCP optimization")
     1021 +
     1022 +         panel_style = kwargs.pop("tool_panel_style", "bright_magenta")
     1023 +
     1024 +         segments = extract_prompt_segments(prompt)
     1025 +         tool_segment_id = f"tool:{tool_name}"
     1026 +         segment_lookup = {segment.segment_id: segment for segment in segments}
     1027 +         if tool_segment_id not in segment_lookup:
     1028 +             raise ValueError(f"Tool '{tool_name}' not present in prompt tools")
     1029 +
     1030 +         fallback_args_fn = fallback_arguments or extract_tool_arguments
     1031 +
     1032 +         if fallback_invoker is None:
     1033 +             function_map = getattr(prompt, "function_map", {}) or {}
     1034 +             default_invoker_candidate = function_map.get(tool_name)
     1035 +             if default_invoker_candidate is not None:
     1036 +                 typed_invoker = cast(Callable[..., str], default_invoker_candidate)
     1037 +
     1038 +                 def _fallback_invoker(args: dict[str, Any]) -> str:
     1039 +                     return typed_invoker(**args)
     1040 +
     1041 +                 fallback_invoker = _fallback_invoker
     1042 +
     1043 +         tool_entry = None
     1044 +         for entry in prompt.tools or []:
     1045 +             function = entry.get("function", {})
     1046 +             if (function.get("name") or entry.get("name")) == tool_name:
     1047 +                 tool_entry = entry
     1048 +                 break
     1049 +         if tool_entry is None:
     1050 +             raise ValueError(f"Tool '{tool_name}' not present in prompt.tools")
     1051 +
     1052 +         original_description = tool_entry.get("function", {}).get("description", "")
     1053 +         tool_metadata = segment_lookup[tool_segment_id].metadata.get("raw_tool", {})
     1054 +
     1055 +         mcp_config = MCPExecutionConfig(
     1056 +             coordinator=second_pass,
     1057 +             tool_name=tool_name,
     1058 +             fallback_arguments=fallback_args_fn,
     1059 +             fallback_invoker=fallback_invoker,
     1060 +             allow_tool_use_on_second_pass=allow_tool_use_on_second_pass,
1587 1061           )
1588 1062
1589      -
1590      -
1591      -         current_output_style_guidance = self.output_style_guidance
1592      -
1593      -         user_prompt_for_llm_crossover = f"""Parent Prompt 1:
1594      -   '''{parent1_messages}'''
     1063 +         previous_context = getattr(self, "_mcp_context", None)
     1064 +         previous_crossover = self.enable_llm_crossover
1595 1065
1596      -
1597      -
     1066 +         context = EvolutionaryMCPContext(
     1067 +             tool_name=tool_name,
     1068 +             tool_segment_id=tool_segment_id,
     1069 +             original_description=original_description,
     1070 +             tool_metadata=tool_metadata,
     1071 +             panel_style=panel_style,
     1072 +         )
1598 1073
1599      -
     1074 +         self._mcp_context = context
     1075 +         self.enable_llm_crossover = False
1600 1076
1601      -   Please generate TWO child prompts by intelligently blending the ideas, styles, or structures from these two parents, ensuring the children aim to elicit the desired output style.
1602      -   Follow the instructions provided in the system prompt regarding the JSON output format:
1603      -   [
1604      -   [{{"role": "<role>", "content": "<content>"}}, {{"role": "<role>", "content": "<content>"}}], #child_1
1605      -   [{{"role": "<role>", "content": "<content>"}}, {{"role": "<role>", "content": "<content>"}}], #child_2
1606      -   ]
1607      -   """
1608 1077           try:
1609      -
1610      -
     1078 +             result = self.optimize_prompt(
     1079 +                 prompt=prompt,
     1080 +                 dataset=dataset,
     1081 +                 metric=metric,
     1082 +                 experiment_config=experiment_config,
     1083 +                 n_samples=n_samples,
     1084 +                 auto_continue=auto_continue,
     1085 +                 agent_class=agent_class,
     1086 +                 mcp_config=mcp_config,
     1087 +                 **kwargs,
1611 1088               )
1612      -
1613      -
1614      -
1615      -                         "role": "system",
1616      -                         "content": self.get_llm_crossover_system_prompt(),
1617      -                     },
1618      -                     {"role": "user", "content": user_prompt_for_llm_crossover},
1619      -                 ],
1620      -                 is_reasoning=True,
1621      -             )
1622      -             logger.debug(f"Raw LLM response for crossover: {response_content}")
1623      -
1624      -             json_response = utils.json_to_dict(response_content)
1625      -             if (
1626      -                 not isinstance(json_response, list)
1627      -                 or len(json_response) != 2
1628      -                 or not all(isinstance(cs, list) for cs in json_response)
1629      -             ):
1630      -                 logger.warning(
1631      -                     "LLM Crossover: Malformed or empty children_prompts list. Falling back."
1632      -                 )
1633      -                 raise ValueError("Malformed LLM crossover response")
     1089 +         finally:
     1090 +             self._mcp_context = previous_context
     1091 +             self.enable_llm_crossover = previous_crossover
1634 1092
1635      -
1636      -
     1093 +         finalize_mcp_result(result, context, panel_style)
     1094 +         return result
1637 1095
1638      -
1639      -                 f"LLM Crossover generated child1: {json.dumps(child1)[:50]}... Child2: {json.dumps(child2)[:50]}..."
1640      -             )
1641      -             return creator.Individual(child1), creator.Individual(child2)
1642      -
1643      -         except Exception as e:
1644      -             logger.warning(
1645      -                 f"LLM-driven crossover failed: {e}. Falling back to standard crossover."
1646      -             )
1647      -             return self._deap_crossover(ind1, ind2)
     1096 +     # Evaluation is provided by EvaluationOps
1648 1097
1649      -
1650      -
1651      -         description = "Task: Given a list of AI messages with placeholder values, generate an effective prompt. "
1652      -         description += f"The original high-level instruction being optimized is: '{prompt.get_messages()}'. "
1653      -         description += "The goal is to create an effective prompt that guides a language model to perform this task well."
1654      -         return description
     1098 +     # LLM crossover is provided by CrossoverOps
     1099 +     # Helper provided by Helpers
1655 1100
     1101 +     # Override prompt builders to centralize strings in prompts.py
1656 1102       def _get_reasoning_system_prompt_for_variation(self) -> str:
1657      -         return
1658      -
1659      -   For each prompt variation, consider:
1660      -   1. Different levels of specificity and detail, including significantly more detailed and longer versions.
1661      -   2. Various ways to structure the instruction, exploring more complex sentence structures and phrasings.
1662      -   3. Alternative phrasings that maintain the core intent but vary in style and complexity.
1663      -   4. Different emphasis on key components, potentially elaborating on them.
1664      -   5. Various ways to express constraints or requirements.
1665      -   6. Different approaches to clarity and conciseness, but also explore more verbose and explanatory styles.
1666      -   7. Alternative ways to guide the model's response format.
1667      -   8. Consider variations that are substantially longer and more descriptive than the original.
1668      -
1669      -   The generated prompts should guide a target LLM to produce outputs in the following style: '{self.output_style_guidance}'
1670      -
1671      -   Return a JSON array of prompts with the following structure:
1672      -   {{
1673      -   "prompts": [
1674      -   {{
1675      -   "prompt": "alternative prompt 1",
1676      -   "strategy": "brief description of the variation strategy used, e.g., 'focused on eliciting specific output style'"
1677      -   }},
1678      -   {{
1679      -   "prompt": "alternative prompt 2",
1680      -   "strategy": "brief description of the variation strategy used"
1681      -   }}
1682      -   ]
1683      -   }}
1684      -   Each prompt variation should aim to get the target LLM to produce answers matching the desired style: '{self.output_style_guidance}'.
1685      -   """
     1103 +         return evo_prompts.variation_system_prompt(self.output_style_guidance)
1686 1104
1687 1105       def get_llm_crossover_system_prompt(self) -> str:
1688      -         return
1689      -   Given two parent prompts, your task is to generate one or two new child prompts that effectively combine the strengths, styles, or core ideas of both parents.
1690      -   The children should be coherent and aim to explore a potentially more effective region of the prompt design space, with a key goal of eliciting responses from the target language model in the following style: '{self.output_style_guidance}'.
1691      -
1692      -   Consider the following when generating children:
1693      -   - Identify the key instructions, constraints, and desired output formats in each parent, paying attention to any hints about desired output style.
1694      -   - Explore ways to merge these elements such that the resulting prompt strongly guides the target LLM towards the desired output style.
1695      -   - You can create a child that is a direct blend, or one that takes a primary structure from one parent and incorporates specific elements from the other, always optimizing for clear instruction towards the desired output style.
1696      -   - If generating two children, try to make them distinct from each other and from the parents, perhaps by emphasizing different aspects of the parental combination that could lead to the desired output style.
1697      -
1698      -   All generated prompts must aim for eliciting answers in the style: '{self.output_style_guidance}'.
1699      -
1700      -   Return a JSON object that is a list of both child prompts. Each child prompt is a list of LLM messages. Example:
1701      -   [
1702      -   [{{"role": "<role>", "content": "<content>"}},{{"role": "<role>", "content": "<content>"}}],
1703      -   [{{"role": "<role>", "content": "<content>"}},{{"role": "<role>", "content": "<content>"}}]
1704      -   ]
1705      -
1706      -
1707      -   """
     1106 +         return evo_prompts.llm_crossover_system_prompt(self.output_style_guidance)
1708 1107
1709 1108       def _get_radical_innovation_system_prompt(self) -> str:
1710      -         return
1711      -   Given a task description and an existing prompt for that task (which might be underperforming), your goal is to generate a new, significantly improved, and potentially very different prompt.
1712      -   Do not just make minor edits. Think about alternative approaches, structures, and phrasings that could lead to better performance.
1713      -   Consider clarity, specificity, constraints, and how to best guide the language model for the described task TO PRODUCE OUTPUTS IN THE FOLLOWING STYLE: '{self.output_style_guidance}'.
1714      -   Return only the new prompt string, with no preamble or explanation.
1715      -   """
1716      -
1717      -     def _infer_output_style_from_dataset(
1718      -         self, dataset: opik.Dataset, prompt: chat_prompt.ChatPrompt, n_examples: int = 5
1719      -     ) -> Optional[str]:
1720      -         """Analyzes dataset examples to infer the desired output style."""
1721      -         with reporting.infer_output_style(
1722      -             verbose=self.verbose
1723      -         ) as report_infer_output_style:
1724      -             report_infer_output_style.start_style_inference()
1725      -
1726      -             try:
1727      -                 items_to_process = dataset.get_items(n_examples)
1728      -             except Exception as e:
1729      -                 report_infer_output_style.error(
1730      -                     f"Failed to get items from dataset '{dataset.name}': {e}"
1731      -                 )
1732      -                 return None
1733      -
1734      -             if not items_to_process:
1735      -                 report_infer_output_style.error(
1736      -                     f"Dataset '{dataset.name}' is empty. Cannot infer output style."
1737      -                 )
1738      -                 return None
1739      -
1740      -             # Need at least a couple of examples for meaningful inference
1741      -             if len(items_to_process) < min(n_examples, 2):
1742      -                 report_infer_output_style.error(
1743      -                     f"Not enough dataset items (found {len(items_to_process)}) to reliably infer output style. Need at least {min(n_examples,2)}."
1744      -                 )
1745      -                 return None
1746      -
1747      -             examples_str = ""
1748      -             for i, item_content in enumerate(items_to_process):
1749      -                 filtered_content = {x: y for x, y in item_content.items() if x != "id"}
1750      -                 examples_str += (
1751      -                     f"Example {i+1}:\nDataset Item:\n{filtered_content}\n---\n"
1752      -                 )
1753      -
1754      -             user_prompt_for_style_inference = f"""Please analyze the following examples from a dataset and provide a concise, actionable description of the REQUIRED output style for the target LLM. Before describing the output style, make sure to understand the dataset content and structure as it can include input, output and metadata fields. This description will be used to guide other LLMs in generating and refining prompts.
1755      -
1756      -   {examples_str}
1757      -
1758      -   Based on these examples, what is the desired output style description?
1759      -   Remember to focus on aspects like length, tone, structure, content details, and any recurring keywords or phrasing patterns in the outputs.
1760      -   The description should be a single string that can be directly used as an instruction for another LLM.
1761      -   Return ONLY this descriptive string.
1762      -   """
1763      -             # report_infer_output_style.display_style_inference_prompt(user_prompt_for_style_inference)
1764      -
1765      -             try:
1766      -                 inferred_style = self._call_model(
1767      -                     messages=[
1768      -                         {"role": "system", "content": self._INFER_STYLE_SYSTEM_PROMPT},
1769      -                         {"role": "user", "content": user_prompt_for_style_inference},
1770      -                     ],
1771      -                     is_reasoning=True,
1772      -                 )
1773      -                 inferred_style = inferred_style.strip()
1774      -                 if inferred_style:
1775      -                     report_infer_output_style.success(inferred_style)
1776      -                     return inferred_style
1777      -                 else:
1778      -                     report_infer_output_style.error(
1779      -                         "LLM returned empty string for inferred output style."
1780      -                     )
1781      -                     return None
1782      -             except Exception as e:
1783      -                 report_infer_output_style.error(
1784      -                     f"Error during output style inference: {e}"
1785      -                 )
1786      -                 return None
     1109 +         return evo_prompts.radical_innovation_system_prompt(self.output_style_guidance)
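
Note on the first hunk above (the new MCP-aware optimization entry point): the method name and its leading parameters (`prompt`, `dataset`, `metric`, `tool_name`) sit above this hunk, so the call below is only a minimal usage sketch. The method name `optimize_mcp`, the `ChatPrompt` constructor arguments, and the placeholder variables `dataset`, `metric`, and `second_pass` (objects prepared elsewhere) are assumptions for illustration; only the keyword parameters visible in the hunk are taken from the diff.

# Hypothetical usage sketch -- names marked "assumed" are not confirmed by this hunk.
from opik_optimizer import ChatPrompt, EvolutionaryOptimizer  # assumed import path

def search_docs(query: str) -> str:
    # Stand-in local implementation used as the fallback invoker.
    return f"stub results for: {query}"

prompt = ChatPrompt(  # constructor arguments assumed
    system="Answer the question, calling the search tool when needed.",
    user="{question}",
    tools=[
        {
            "type": "function",
            "function": {
                "name": "search_docs",
                "description": "Search the documentation index.",
                "parameters": {
                    "type": "object",
                    "properties": {"query": {"type": "string"}},
                    "required": ["query"],
                },
            },
        }
    ],
    function_map={"search_docs": search_docs},
)

optimizer = EvolutionaryOptimizer(model="openai/gpt-4o-mini")
result = optimizer.optimize_mcp(          # method name assumed; defined above this hunk
    prompt=prompt,
    dataset=dataset,                      # an opik.Dataset prepared elsewhere
    metric=metric,                        # a scoring callable prepared elsewhere
    tool_name="search_docs",
    second_pass=second_pass,              # an MCPSecondPassCoordinator prepared elsewhere
    n_samples=10,
    allow_tool_use_on_second_pass=False,
)
print(result)                             # OptimizationResult returned per the hunk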
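
Note on the `# Evaluation is provided by EvaluationOps` / `# LLM crossover is provided by CrossoverOps` comments added above: the removed method bodies now live in separate ops modules. A minimal sketch of that composition pattern, assuming plain Python mixins; apart from the method and class names quoted in the comments, everything below is illustrative rather than the package's actual interface.

# Illustrative sketch of the mixin-style split suggested by the comments above.
# The real EvaluationOps / CrossoverOps implementations are not reproduced here.
from typing import Any


class EvaluationOps:
    def _evaluate_prompt(self, *args: Any, **kwargs: Any) -> float:
        """Shared dataset-evaluation logic (placeholder body)."""
        raise NotImplementedError


class CrossoverOps:
    def _llm_deap_crossover(self, ind1: Any, ind2: Any) -> tuple[Any, Any]:
        """Shared LLM-driven crossover logic (placeholder body)."""
        raise NotImplementedError


class EvolutionaryOptimizerSketch(EvaluationOps, CrossoverOps):
    # The optimizer keeps only orchestration code; methods removed in the hunks
    # above now resolve through the mixins via normal attribute lookup.
    pass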
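
Note on the three prompt-builder methods at the end of the section: each long inline f-string template is replaced by a one-line call into a prompts module (`evo_prompts`). The helper signatures below are inferred from those call sites (one `output_style_guidance` string in, one prompt string out); the template wording is illustrative only and differs from the text defined in the package's new prompts module.

# Sketch of the delegation target inferred from the call sites above.
def variation_system_prompt(output_style_guidance: str) -> str:
    return (
        "You are an expert prompt engineer. Generate distinct variations of the "
        f"given prompt. Target output style: '{output_style_guidance}'."
    )


def llm_crossover_system_prompt(output_style_guidance: str) -> str:
    return (
        "Blend two parent prompts into one or two coherent child prompts. "
        f"Children must elicit answers in the style: '{output_style_guidance}'."
    )


def radical_innovation_system_prompt(output_style_guidance: str) -> str:
    return (
        "Rewrite an underperforming prompt from scratch, exploring new structures. "
        f"Outputs must follow the style: '{output_style_guidance}'."
    )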