opik-optimizer 1.0.5__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opik_optimizer/__init__.py +2 -0
- opik_optimizer/_throttle.py +2 -1
- opik_optimizer/base_optimizer.py +28 -11
- opik_optimizer/colbert.py +236 -0
- opik_optimizer/data/context7_eval.jsonl +3 -0
- opik_optimizer/datasets/context7_eval.py +90 -0
- opik_optimizer/datasets/tiny_test.py +33 -34
- opik_optimizer/datasets/truthful_qa.py +2 -2
- opik_optimizer/evolutionary_optimizer/crossover_ops.py +194 -0
- opik_optimizer/evolutionary_optimizer/evaluation_ops.py +73 -0
- opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +124 -941
- opik_optimizer/evolutionary_optimizer/helpers.py +10 -0
- opik_optimizer/evolutionary_optimizer/llm_support.py +134 -0
- opik_optimizer/evolutionary_optimizer/mutation_ops.py +292 -0
- opik_optimizer/evolutionary_optimizer/population_ops.py +223 -0
- opik_optimizer/evolutionary_optimizer/prompts.py +305 -0
- opik_optimizer/evolutionary_optimizer/reporting.py +16 -4
- opik_optimizer/evolutionary_optimizer/style_ops.py +86 -0
- opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +26 -23
- opik_optimizer/few_shot_bayesian_optimizer/reporting.py +12 -5
- opik_optimizer/gepa_optimizer/__init__.py +3 -0
- opik_optimizer/gepa_optimizer/adapter.py +152 -0
- opik_optimizer/gepa_optimizer/gepa_optimizer.py +556 -0
- opik_optimizer/gepa_optimizer/reporting.py +181 -0
- opik_optimizer/logging_config.py +42 -7
- opik_optimizer/mcp_utils/__init__.py +22 -0
- opik_optimizer/mcp_utils/mcp.py +541 -0
- opik_optimizer/mcp_utils/mcp_second_pass.py +152 -0
- opik_optimizer/mcp_utils/mcp_simulator.py +116 -0
- opik_optimizer/mcp_utils/mcp_workflow.py +493 -0
- opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +399 -69
- opik_optimizer/meta_prompt_optimizer/reporting.py +16 -2
- opik_optimizer/mipro_optimizer/_lm.py +20 -20
- opik_optimizer/mipro_optimizer/_mipro_optimizer_v2.py +51 -50
- opik_optimizer/mipro_optimizer/mipro_optimizer.py +33 -28
- opik_optimizer/mipro_optimizer/utils.py +2 -4
- opik_optimizer/optimizable_agent.py +18 -17
- opik_optimizer/optimization_config/chat_prompt.py +44 -23
- opik_optimizer/optimization_config/configs.py +3 -3
- opik_optimizer/optimization_config/mappers.py +9 -8
- opik_optimizer/optimization_result.py +21 -14
- opik_optimizer/reporting_utils.py +61 -10
- opik_optimizer/task_evaluator.py +9 -8
- opik_optimizer/utils/__init__.py +15 -0
- opik_optimizer/{utils.py → utils/core.py} +111 -26
- opik_optimizer/utils/dataset_utils.py +49 -0
- opik_optimizer/utils/prompt_segments.py +186 -0
- {opik_optimizer-1.0.5.dist-info → opik_optimizer-1.1.0.dist-info}/METADATA +93 -16
- opik_optimizer-1.1.0.dist-info/RECORD +73 -0
- opik_optimizer-1.1.0.dist-info/licenses/LICENSE +203 -0
- opik_optimizer-1.0.5.dist-info/RECORD +0 -50
- opik_optimizer-1.0.5.dist-info/licenses/LICENSE +0 -21
- {opik_optimizer-1.0.5.dist-info → opik_optimizer-1.1.0.dist-info}/WHEEL +0 -0
- {opik_optimizer-1.0.5.dist-info → opik_optimizer-1.1.0.dist-info}/top_level.txt +0 -0
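The headline change in 1.1.0 is that `evolutionary_optimizer/evolutionary_optimizer.py` shrinks by roughly 940 lines while new `crossover_ops`, `evaluation_ops`, `helpers`, `llm_support`, `mutation_ops`, `population_ops`, `prompts`, and `style_ops` modules appear alongside it. The classes in the hunks below carry only `if TYPE_CHECKING:` attribute hints, which reads like a refactor into mixins that a host optimizer class is expected to combine. The sketch below shows how such a composition might look; the composed class, its attribute values, and the model name are illustrative assumptions, not the package's actual `EvolutionaryOptimizer` definition.

```python
# Illustrative only: a hypothetical composition of the new mixin modules.
# Module paths and class names come from the diff; everything else in this
# class body is an assumption made for the sketch.
from opik_optimizer.evolutionary_optimizer.helpers import Helpers
from opik_optimizer.evolutionary_optimizer.llm_support import LlmSupport
from opik_optimizer.evolutionary_optimizer.mutation_ops import MutationOps
from opik_optimizer.evolutionary_optimizer.population_ops import PopulationOps


class ExampleEvolutionaryOptimizer(LlmSupport, Helpers, MutationOps, PopulationOps):
    """Hypothetical host class supplying the attributes the mixins expect."""

    model = "openai/gpt-4o-mini"      # read by LlmSupport._call_model (assumed value)
    llm_call_counter = 0
    project_name = None
    verbose = 1
    output_style_guidance = "Answer concisely."
    DEFAULT_DIVERSITY_THRESHOLD = 0.3  # assumed value for the sketch
    population_size = 10
```

Under this reading, methods such as `_deap_mutation` and `_initialize_population` resolve `self._call_model` and the other hinted attributes against whatever the host class provides.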
opik_optimizer/evolutionary_optimizer/helpers.py (new file)
@@ -0,0 +1,10 @@
+from ..optimization_config import chat_prompt
+
+
+class Helpers:
+    def _get_task_description_for_llm(self, prompt: chat_prompt.ChatPrompt) -> str:
+        """Generates a concise task description for LLM prompts that need context."""
+        description = "Task: Given a list of AI messages with placeholder values, generate an effective prompt. "
+        description += f"The original high-level instruction being optimized is: '{prompt.get_messages()}'. "
+        description += "The goal is to create an effective prompt that guides a language model to perform this task well."
+        return description
opik_optimizer/evolutionary_optimizer/llm_support.py (new file)
@@ -0,0 +1,134 @@
+from typing import Any, TYPE_CHECKING
+
+import logging
+import os
+import time
+import random
+
+import litellm
+from litellm import exceptions as litellm_exceptions
+from litellm.caching import Cache
+from litellm.types.caching import LiteLLMCacheType
+from opik.evaluation.models.litellm import opik_monitor as opik_litellm_monitor
+
+from .. import _throttle
+
+
+logger = logging.getLogger(__name__)
+
+
+# Configure LiteLLM cache with safe fallback
+try:
+    # Prefer a disk cache in a user-writable location
+    cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "litellm")
+    os.makedirs(cache_dir, exist_ok=True)
+    litellm.cache = Cache(type=LiteLLMCacheType.DISK, cache_dir=cache_dir)
+except (PermissionError, OSError, FileNotFoundError):
+    # Fall back to in-memory cache to avoid disk timeouts/locks
+    litellm.cache = Cache(type=LiteLLMCacheType.MEMORY)
+
+_rate_limiter = _throttle.get_rate_limiter_for_current_opik_installation()
+
+
+class LlmSupport:
+    if TYPE_CHECKING:
+        model: str
+        llm_call_counter: int
+        project_name: str | None
+        disable_litellm_monitoring: bool
+        temperature: float
+        max_tokens: int
+        top_p: float
+        frequency_penalty: float
+        presence_penalty: float
+
+    @_throttle.rate_limited(_rate_limiter)
+    def _call_model(
+        self,
+        messages: list[dict[str, str]],
+        is_reasoning: bool = False,
+        optimization_id: str | None = None,
+    ) -> str:
+        """Call the model with the given prompt and return the response string."""
+        # Build base call params
+        llm_config_params: dict[str, Any] = {
+            "temperature": getattr(self, "temperature", 0.3),
+            "max_tokens": getattr(self, "max_tokens", 1000),
+            "top_p": getattr(self, "top_p", 1.0),
+            "frequency_penalty": getattr(self, "frequency_penalty", 0.0),
+            "presence_penalty": getattr(self, "presence_penalty", 0.0),
+        }
+
+        # Add Opik metadata unless disabled
+        try:
+            disable_monitoring_env = os.getenv(
+                "OPIK_OPTIMIZER_DISABLE_LITELLM_MONITORING", "0"
+            )
+            disable_monitoring = getattr(
+                self, "disable_litellm_monitoring", False
+            ) or disable_monitoring_env.lower() in ("1", "true", "yes")
+
+            if not disable_monitoring:
+                metadata_for_opik: dict[str, Any] = {}
+                pn = getattr(self, "project_name", None)
+                if pn:
+                    metadata_for_opik["project_name"] = pn
+                    metadata_for_opik["opik"] = {"project_name": pn}
+                if optimization_id and "opik" in metadata_for_opik:
+                    metadata_for_opik["opik"]["optimization_id"] = optimization_id
+                metadata_for_opik["optimizer_name"] = self.__class__.__name__
+                metadata_for_opik["opik_call_type"] = (
+                    "reasoning" if is_reasoning else "evaluation_llm_task_direct"
+                )
+                if metadata_for_opik:
+                    llm_config_params["metadata"] = metadata_for_opik
+
+                # Try to add Opik monitoring callbacks; fall back silently on failure
+                llm_config_params = (
+                    opik_litellm_monitor.try_add_opik_monitoring_to_params(  # type: ignore
+                        llm_config_params.copy()
+                    )
+                )
+        except Exception as e:
+            logger.debug(f"Skipping Opik-LiteLLM monitoring setup: {e}")
+
+        # Retry policy for transient errors
+        max_retries = int(os.getenv("OPIK_OPTIMIZER_LITELLM_MAX_RETRIES", "3"))
+        base_sleep = float(os.getenv("OPIK_OPTIMIZER_LITELLM_BACKOFF", "0.5"))
+
+        for attempt in range(max_retries + 1):
+            try:
+                logger.debug(
+                    f"Calling model '{self.model}' with messages: {messages}, params: {llm_config_params} (attempt {attempt + 1})"
+                )
+                response = litellm.completion(
+                    model=self.model, messages=messages, **llm_config_params
+                )
+                self.llm_call_counter += 1
+                return response.choices[0].message.content
+            except (
+                litellm_exceptions.RateLimitError,
+                litellm_exceptions.APIConnectionError,
+                litellm_exceptions.InternalServerError,
+            ) as e:
+                if attempt < max_retries:
+                    sleep_s = min(10.0, base_sleep * (2**attempt)) + random.uniform(
+                        0, 0.25
+                    )
+                    logger.warning(
+                        f"LiteLLM transient error ({type(e).__name__}): {e}. Retrying in {sleep_s:.2f}s..."
+                    )
+                    time.sleep(sleep_s)
+                    continue
+                logger.error(f"LiteLLM error (final attempt): {e}")
+                raise
+            except litellm_exceptions.ContextWindowExceededError as e:
+                logger.error(f"LiteLLM Context Window Exceeded Error: {e}")
+                raise
+            except Exception as e:
+                logger.error(
+                    f"Error calling model '{self.model}': {type(e).__name__} - {e}"
+                )
+                raise
+        # Should never reach here
+        raise RuntimeError("LLM call did not return a response and did not raise")
opik_optimizer/evolutionary_optimizer/mutation_ops.py (new file)
@@ -0,0 +1,292 @@
+from typing import Any, TYPE_CHECKING
+
+import json
+import logging
+import random
+
+from . import prompts as evo_prompts
+from ..optimization_config import chat_prompt
+from .. import utils
+from . import reporting
+
+
+logger = logging.getLogger(__name__)
+
+
+class MutationOps:
+    if TYPE_CHECKING:
+        _calculate_population_diversity: Any
+        DEFAULT_DIVERSITY_THRESHOLD: float
+        verbose: int
+        output_style_guidance: str
+        _get_task_description_for_llm: Any
+        _call_model: Any
+
+    def _deap_mutation(
+        self, individual: Any, initial_prompt: chat_prompt.ChatPrompt
+    ) -> Any:
+        """Enhanced mutation operation with multiple strategies."""
+        prompt = chat_prompt.ChatPrompt(messages=individual)
+
+        # Choose mutation strategy based on current diversity
+        diversity = self._calculate_population_diversity()
+
+        # Determine thresholds based on diversity
+        if diversity < self.DEFAULT_DIVERSITY_THRESHOLD:
+            # Low diversity - use more aggressive mutations (higher chance for semantic)
+            semantic_threshold = 0.5
+            structural_threshold = 0.8  # semantic_threshold + 0.3
+        else:
+            # Good diversity - use more conservative mutations (higher chance for word_level)
+            semantic_threshold = 0.4
+            structural_threshold = 0.7  # semantic_threshold + 0.3
+
+        mutation_choice = random.random()
+
+        if mutation_choice > structural_threshold:
+            mutated_prompt = self._word_level_mutation_prompt(prompt)
+            reporting.display_success(
+                " Mutation successful, prompt has been edited by randomizing words (word-level mutation).",
+                verbose=self.verbose,
+            )
+            return type(individual)(mutated_prompt.get_messages())
+        elif mutation_choice > semantic_threshold:
+            mutated_prompt = self._structural_mutation(prompt)
+            reporting.display_success(
+                " Mutation successful, prompt has been edited by reordering, combining, or splitting sentences (structural mutation).",
+                verbose=self.verbose,
+            )
+            return type(individual)(mutated_prompt.get_messages())
+        else:
+            mutated_prompt = self._semantic_mutation(prompt, initial_prompt)
+            reporting.display_success(
+                " Mutation successful, prompt has been edited using an LLM (semantic mutation).",
+                verbose=self.verbose,
+            )
+            return type(individual)(mutated_prompt.get_messages())
+
+    def _semantic_mutation(
+        self, prompt: chat_prompt.ChatPrompt, initial_prompt: chat_prompt.ChatPrompt
+    ) -> chat_prompt.ChatPrompt:
+        """Enhanced semantic mutation with multiple strategies."""
+        current_output_style_guidance = self.output_style_guidance
+        if random.random() < 0.1:
+            return self._radical_innovation_mutation(prompt, initial_prompt)
+
+        try:
+            strategy = random.choice(
+                [
+                    "rephrase",
+                    "simplify",
+                    "elaborate",
+                    "restructure",
+                    "focus",
+                    "increase_complexity_and_detail",
+                ]
+            )
+
+            strategy_prompts = evo_prompts.mutation_strategy_prompts(
+                current_output_style_guidance
+            )
+            user_prompt_for_semantic_mutation = (
+                evo_prompts.semantic_mutation_user_prompt(
+                    prompt.get_messages(),
+                    self._get_task_description_for_llm(initial_prompt),
+                    current_output_style_guidance,
+                    strategy_prompts[strategy],
+                )
+            )
+            response = self._call_model(
+                messages=[
+                    {
+                        "role": "system",
+                        "content": evo_prompts.semantic_mutation_system_prompt(
+                            current_output_style_guidance
+                        ),
+                    },
+                    {"role": "user", "content": user_prompt_for_semantic_mutation},
+                ],
+                is_reasoning=True,
+            )
+
+            try:
+                messages = utils.json_to_dict(response.strip())
+            except Exception as parse_exc:
+                raise RuntimeError(
+                    f"Error parsing semantic mutation response as JSON. "
+                    f"Response: {response!r}\nOriginal error: {parse_exc}"
+                ) from parse_exc
+            return chat_prompt.ChatPrompt(messages=messages)
+        except Exception as e:
+            reporting.display_error(
+                f" Error in semantic mutation, this is usually a parsing error: {e}",
+                verbose=self.verbose,
+            )
+            return prompt
+
+    def _structural_mutation(
+        self, prompt: chat_prompt.ChatPrompt
+    ) -> chat_prompt.ChatPrompt:
+        """Perform structural mutation (reordering, combining, splitting)."""
+        mutated_messages: list[dict[str, str]] = []
+
+        for message in prompt.get_messages():
+            content = message["content"]
+            role = message["role"]
+
+            sentences = [s.strip() for s in content.split(".") if s.strip()]
+            if len(sentences) <= 1:
+                mutated_messages.append(
+                    {"role": role, "content": self._word_level_mutation(content)}
+                )
+                continue
+
+            mutation_type = random.random()
+            if mutation_type < 0.3:
+                random.shuffle(sentences)
+                mutated_messages.append(
+                    {"role": role, "content": ". ".join(sentences) + "."}
+                )
+                continue
+            elif mutation_type < 0.6:
+                if len(sentences) >= 2:
+                    idx = random.randint(0, len(sentences) - 2)
+                    combined = sentences[idx] + " and " + sentences[idx + 1]
+                    sentences[idx : idx + 2] = [combined]
+                    mutated_messages.append(
+                        {"role": role, "content": ". ".join(sentences) + "."}
+                    )
+                    continue
+            else:
+                idx = random.randint(0, len(sentences) - 1)
+                words = sentences[idx].split()
+                if len(words) > 3:
+                    split_point = random.randint(2, len(words) - 2)
+                    sentences[idx : idx + 1] = [
+                        " ".join(words[:split_point]),
+                        " ".join(words[split_point:]),
+                    ]
+                    mutated_messages.append(
+                        {"role": role, "content": ". ".join(sentences) + "."}
+                    )
+                    continue
+                else:
+                    mutated_messages.append({"role": role, "content": content})
+
+        return chat_prompt.ChatPrompt(messages=mutated_messages)
+
+    def _word_level_mutation_prompt(
+        self, prompt: chat_prompt.ChatPrompt
+    ) -> chat_prompt.ChatPrompt:
+        mutated_messages: list[dict[str, str]] = []
+        for message in prompt.get_messages():
+            mutated_messages.append(
+                {
+                    "role": message["role"],
+                    "content": self._word_level_mutation(message["content"]),
+                }
+            )
+        return chat_prompt.ChatPrompt(messages=mutated_messages)
+
+    def _word_level_mutation(self, msg_content: str) -> str:
+        """Perform word-level mutation."""
+        words = msg_content.split()
+        if len(words) <= 1:
+            return msg_content
+
+        mutation_type = random.random()
+        if mutation_type < 0.3:
+            idx = random.randint(0, len(words) - 1)
+            words[idx] = self._get_synonym(words[idx])
+        elif mutation_type < 0.6:
+            if len(words) > 2:
+                i, j = random.sample(range(len(words)), 2)
+                words[i], words[j] = words[j], words[i]
+        else:
+            idx = random.randint(0, len(words) - 1)
+            words[idx] = self._modify_phrase(words[idx])
+
+        return " ".join(words)
+
+    def _get_synonym(self, word: str) -> str:
+        """Get a synonym for a word using LLM."""
+        try:
+            response = self._call_model(
+                messages=[
+                    {"role": "system", "content": evo_prompts.synonyms_system_prompt()},
+                    {
+                        "role": "user",
+                        "content": (
+                            f"Give me a single synonym for the word '{word}'. Return only the synonym, nothing else."
+                        ),
+                    },
+                ],
+                is_reasoning=True,
+            )
+            return response.strip()
+        except Exception as e:
+            logger.warning(f"Error getting synonym for '{word}': {e}")
+            return word
+
+    def _modify_phrase(self, phrase: str) -> str:
+        """Modify a phrase while preserving meaning using LLM."""
+        try:
+            response = self._call_model(
+                messages=[
+                    {"role": "system", "content": evo_prompts.rephrase_system_prompt()},
+                    {
+                        "role": "user",
+                        "content": (
+                            f"Modify this phrase while keeping the same meaning: '{phrase}'. Return only the modified phrase, nothing else."
+                        ),
+                    },
+                ],
+                is_reasoning=True,
+            )
+            return response.strip()
+        except Exception as e:
+            logger.warning(f"Error modifying phrase '{phrase}': {e}")
+            return phrase
+
+    def _radical_innovation_mutation(
+        self, prompt: chat_prompt.ChatPrompt, initial_prompt: chat_prompt.ChatPrompt
+    ) -> chat_prompt.ChatPrompt:
+        """Attempts to generate a significantly improved and potentially very different prompt using an LLM."""
+        logger.debug(
+            f"Attempting radical innovation for prompt: {json.dumps(prompt.get_messages())[:70]}..."
+        )
+        task_desc_for_llm = self._get_task_description_for_llm(initial_prompt)
+        current_output_style_guidance = self.output_style_guidance
+
+        user_prompt_for_radical_innovation = evo_prompts.radical_innovation_user_prompt(
+            task_desc_for_llm, current_output_style_guidance, prompt.get_messages()
+        )
+        try:
+            new_prompt_str = self._call_model(
+                messages=[
+                    {
+                        "role": "system",
+                        "content": evo_prompts.radical_innovation_system_prompt(
+                            current_output_style_guidance
+                        ),
+                    },
+                    {"role": "user", "content": user_prompt_for_radical_innovation},
+                ],
+                is_reasoning=True,
+            )
+            logger.info(
+                f"Radical innovation LLM result (truncated): {new_prompt_str[:200]}"
+            )
+            try:
+                new_messages = utils.json_to_dict(new_prompt_str)
+            except Exception as parse_exc:
+                logger.warning(
+                    f"Failed to parse LLM output in radical innovation mutation for prompt '{json.dumps(prompt.get_messages())[:50]}...'. Output: {new_prompt_str[:200]}. Error: {parse_exc}. Returning original."
+                )
+                return prompt
+            return chat_prompt.ChatPrompt(messages=new_messages)
+        except Exception as e:
+            logger.warning(
+                f"Radical innovation mutation failed for prompt '{json.dumps(prompt.get_messages())[:50]}...': {e}. Returning original."
+            )
+            return prompt
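In `_deap_mutation` above, a single `random.random()` draw is compared against two diversity-dependent thresholds, which amounts to a probability split over the three mutation strategies: with low diversity the split is 50% semantic / 30% structural / 20% word-level, and with good diversity 40% / 30% / 30%. A small sketch of that arithmetic:

```python
# Probability split implied by the thresholds in _deap_mutation above.
def mutation_probabilities(
    semantic_threshold: float, structural_threshold: float
) -> dict[str, float]:
    return {
        "semantic": semantic_threshold,
        "structural": structural_threshold - semantic_threshold,
        "word_level": 1.0 - structural_threshold,
    }


print(mutation_probabilities(0.5, 0.8))  # low diversity: 50% / 30% / 20%
print(mutation_probabilities(0.4, 0.7))  # good diversity: 40% / 30% / 30%
```

Note that `_semantic_mutation` additionally hands off about 10% of its calls to `_radical_innovation_mutation`.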
opik_optimizer/evolutionary_optimizer/population_ops.py (new file)
@@ -0,0 +1,223 @@
+from typing import Any, TYPE_CHECKING
+
+import json
+import logging
+
+from deap import tools
+from deap import creator as _creator
+
+from . import prompts as evo_prompts
+from . import reporting
+from ..optimization_config import chat_prompt
+from .. import utils
+
+
+logger = logging.getLogger(__name__)
+creator = _creator
+
+
+class PopulationOps:
+    if TYPE_CHECKING:
+        _get_task_description_for_llm: Any
+        output_style_guidance: str
+        _call_model: Any
+        toolbox: Any
+        # Hints for mixin attributes provided by the primary optimizer class
+        _gens_since_pop_improvement: int
+        _best_primary_score_history: list[float]
+        DEFAULT_RESTART_THRESHOLD: float
+        DEFAULT_RESTART_GENERATIONS: int
+        enable_moo: bool
+        elitism_size: int
+        population_size: int
+        verbose: int
+
+    def _initialize_population(
+        self, prompt: chat_prompt.ChatPrompt
+    ) -> list[chat_prompt.ChatPrompt]:
+        """Initialize the population with diverse variations of the initial prompt,
+        including some 'fresh start' prompts based purely on task description.
+        All generated prompts should aim to elicit answers matching self.output_style_guidance.
+        """
+        with reporting.initializing_population(verbose=self.verbose) as init_pop_report:
+            init_pop_report.start(self.population_size)
+
+            population = [prompt]
+            if self.population_size <= 1:
+                return population
+
+            num_to_generate_total = self.population_size - 1
+            num_fresh_starts = max(1, int(num_to_generate_total * 0.2))
+            num_variations_on_initial = num_to_generate_total - num_fresh_starts
+
+            task_desc_for_llm = self._get_task_description_for_llm(prompt)
+            current_output_style_guidance = self.output_style_guidance
+
+            # Fresh starts
+            if num_fresh_starts > 0:
+                init_pop_report.start_fresh_prompts(num_fresh_starts)
+                fresh_start_user_prompt = evo_prompts.fresh_start_user_prompt(
+                    task_desc_for_llm, current_output_style_guidance, num_fresh_starts
+                )
+                try:
+                    response_content = self._call_model(
+                        messages=[
+                            {
+                                "role": "system",
+                                "content": evo_prompts.fresh_start_system_prompt(
+                                    current_output_style_guidance
+                                ),
+                            },
+                            {"role": "user", "content": fresh_start_user_prompt},
+                        ],
+                        is_reasoning=True,
+                    )
+
+                    logger.debug(
+                        f"Raw LLM response for fresh start prompts: {response_content}"
+                    )
+
+                    fresh_prompts = utils.json_to_dict(response_content)
+                    if isinstance(fresh_prompts, list):
+                        if all(isinstance(p, dict) for p in fresh_prompts) and all(
+                            p.get("role") is not None for p in fresh_prompts
+                        ):
+                            population.append(
+                                chat_prompt.ChatPrompt(messages=fresh_prompts)
+                            )
+                            init_pop_report.success_fresh_prompts(1)
+                        elif all(isinstance(p, list) for p in fresh_prompts):
+                            population.extend(
+                                [
+                                    chat_prompt.ChatPrompt(messages=p)
+                                    for p in fresh_prompts[:num_fresh_starts]
+                                ]
+                            )
+                            init_pop_report.success_fresh_prompts(
+                                len(fresh_prompts[:num_fresh_starts])
+                            )
+                    else:
+                        init_pop_report.failed_fresh_prompts(
+                            num_fresh_starts,
+                            f"LLM response for fresh starts was not a valid list of strings or was empty: {response_content}. Skipping fresh start prompts.",
+                        )
+                except json.JSONDecodeError as e_json:
+                    init_pop_report.failed_fresh_prompts(
+                        num_fresh_starts,
+                        f"JSONDecodeError generating fresh start prompts: {e_json}. LLM response: '{response_content}'. Skipping fresh start prompts.",
+                    )
+                except Exception as e:
+                    init_pop_report.failed_fresh_prompts(
+                        num_fresh_starts,
+                        f"Error generating fresh start prompts: {e}. Skipping fresh start prompts.",
+                    )
+
+            # Variations on the initial prompt
+            if num_variations_on_initial > 0:
+                init_pop_report.start_variations(num_variations_on_initial)
+                user_prompt_for_variation = evo_prompts.variation_user_prompt(
+                    prompt.get_messages(),
+                    task_desc_for_llm,
+                    current_output_style_guidance,
+                    num_variations_on_initial,
+                )
+                try:
+                    response_content_variations = self._call_model(
+                        messages=[
+                            {
+                                "role": "system",
+                                "content": evo_prompts.variation_system_prompt(
+                                    current_output_style_guidance
+                                ),
+                            },
+                            {"role": "user", "content": user_prompt_for_variation},
+                        ],
+                        is_reasoning=True,
+                    )
+                    logger.debug(
+                        f"Raw response for population variations: {response_content_variations}"
+                    )
+                    json_response_variations = json.loads(response_content_variations)
+                    generated_prompts_variations = [
+                        p["prompt"]
+                        for p in json_response_variations.get("prompts", [])
+                        if isinstance(p, dict) and "prompt" in p
+                    ]
+
+                    if generated_prompts_variations:
+                        init_pop_report.success_variations(
+                            len(
+                                generated_prompts_variations[:num_variations_on_initial]
+                            )
+                        )
+                        population.extend(
+                            [
+                                chat_prompt.ChatPrompt(messages=p)
+                                for p in generated_prompts_variations[
+                                    :num_variations_on_initial
+                                ]
+                            ]
+                        )
+                    else:
+                        init_pop_report.failed_variations(
+                            num_variations_on_initial,
+                            "Could not parse 'prompts' list for variations. Skipping variations.",
+                        )
+                except Exception as e:
+                    init_pop_report.failed_variations(
+                        num_variations_on_initial,
+                        f"Error calling LLM for initial population variations: {e}",
+                    )
+
+            # Ensure population is of the required size using unique prompts
+            final_population_set: set[str] = set()
+            final_population_list: list[chat_prompt.ChatPrompt] = []
+            for p in population:
+                if json.dumps(p.get_messages()) not in final_population_set:
+                    final_population_set.add(json.dumps(p.get_messages()))
+                    final_population_list.append(p)
+
+            init_pop_report.end(final_population_list)
+            return final_population_list[: self.population_size]
+
+    def _should_restart_population(self, curr_best: float) -> bool:
+        """Update internal counters and decide if we should trigger a population restart."""
+        if self._best_primary_score_history:
+            threshold = self._best_primary_score_history[-1] * (
+                1 + self.DEFAULT_RESTART_THRESHOLD
+            )
+            if curr_best < threshold:
+                self._gens_since_pop_improvement += 1  # type: ignore[attr-defined]
+            else:
+                self._gens_since_pop_improvement = 0  # type: ignore[attr-defined]
+        self._best_primary_score_history.append(curr_best)
+        return self._gens_since_pop_improvement >= self.DEFAULT_RESTART_GENERATIONS  # type: ignore[attr-defined]
+
+    def _restart_population(
+        self,
+        hof: tools.HallOfFame,
+        population: list[Any],
+        best_prompt_so_far: chat_prompt.ChatPrompt,
+    ) -> list[Any]:
+        """Return a fresh, evaluated population seeded by elites."""
+        if self.enable_moo:
+            elites = list(hof)
+        else:
+            elites = tools.selBest(population, self.elitism_size)
+
+        seed_prompt = (
+            chat_prompt.ChatPrompt(
+                messages=max(elites, key=lambda x: x.fitness.values[0])
+            )
+            if elites
+            else best_prompt_so_far
+        )
+
+        prompt_variants = self._initialize_population(seed_prompt)
+        new_pop = [creator.Individual(p.get_messages()) for p in prompt_variants]
+
+        for ind, fit in zip(new_pop, map(self.toolbox.evaluate, new_pop)):
+            ind.fitness.values = fit
+
+        self._gens_since_pop_improvement = 0  # type: ignore[attr-defined]
+        return new_pop