opik-optimizer 1.0.6__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. opik_optimizer/__init__.py +2 -0
  2. opik_optimizer/_throttle.py +2 -1
  3. opik_optimizer/base_optimizer.py +28 -11
  4. opik_optimizer/colbert.py +236 -0
  5. opik_optimizer/data/context7_eval.jsonl +3 -0
  6. opik_optimizer/datasets/context7_eval.py +90 -0
  7. opik_optimizer/datasets/tiny_test.py +33 -34
  8. opik_optimizer/datasets/truthful_qa.py +2 -2
  9. opik_optimizer/evolutionary_optimizer/crossover_ops.py +194 -0
  10. opik_optimizer/evolutionary_optimizer/evaluation_ops.py +73 -0
  11. opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +124 -941
  12. opik_optimizer/evolutionary_optimizer/helpers.py +10 -0
  13. opik_optimizer/evolutionary_optimizer/llm_support.py +134 -0
  14. opik_optimizer/evolutionary_optimizer/mutation_ops.py +292 -0
  15. opik_optimizer/evolutionary_optimizer/population_ops.py +223 -0
  16. opik_optimizer/evolutionary_optimizer/prompts.py +305 -0
  17. opik_optimizer/evolutionary_optimizer/reporting.py +16 -4
  18. opik_optimizer/evolutionary_optimizer/style_ops.py +86 -0
  19. opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +26 -23
  20. opik_optimizer/few_shot_bayesian_optimizer/reporting.py +12 -5
  21. opik_optimizer/gepa_optimizer/__init__.py +3 -0
  22. opik_optimizer/gepa_optimizer/adapter.py +152 -0
  23. opik_optimizer/gepa_optimizer/gepa_optimizer.py +556 -0
  24. opik_optimizer/gepa_optimizer/reporting.py +181 -0
  25. opik_optimizer/logging_config.py +42 -7
  26. opik_optimizer/mcp_utils/__init__.py +22 -0
  27. opik_optimizer/mcp_utils/mcp.py +541 -0
  28. opik_optimizer/mcp_utils/mcp_second_pass.py +152 -0
  29. opik_optimizer/mcp_utils/mcp_simulator.py +116 -0
  30. opik_optimizer/mcp_utils/mcp_workflow.py +493 -0
  31. opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +399 -69
  32. opik_optimizer/meta_prompt_optimizer/reporting.py +16 -2
  33. opik_optimizer/mipro_optimizer/_lm.py +20 -20
  34. opik_optimizer/mipro_optimizer/_mipro_optimizer_v2.py +51 -50
  35. opik_optimizer/mipro_optimizer/mipro_optimizer.py +33 -28
  36. opik_optimizer/mipro_optimizer/utils.py +2 -4
  37. opik_optimizer/optimizable_agent.py +16 -16
  38. opik_optimizer/optimization_config/chat_prompt.py +44 -23
  39. opik_optimizer/optimization_config/configs.py +3 -3
  40. opik_optimizer/optimization_config/mappers.py +9 -8
  41. opik_optimizer/optimization_result.py +21 -14
  42. opik_optimizer/reporting_utils.py +61 -10
  43. opik_optimizer/task_evaluator.py +9 -8
  44. opik_optimizer/utils/__init__.py +15 -0
  45. opik_optimizer/{utils.py → utils/core.py} +111 -26
  46. opik_optimizer/utils/dataset_utils.py +49 -0
  47. opik_optimizer/utils/prompt_segments.py +186 -0
  48. {opik_optimizer-1.0.6.dist-info → opik_optimizer-1.1.0.dist-info}/METADATA +93 -16
  49. opik_optimizer-1.1.0.dist-info/RECORD +73 -0
  50. opik_optimizer-1.1.0.dist-info/licenses/LICENSE +203 -0
  51. opik_optimizer-1.0.6.dist-info/RECORD +0 -50
  52. opik_optimizer-1.0.6.dist-info/licenses/LICENSE +0 -21
  53. {opik_optimizer-1.0.6.dist-info → opik_optimizer-1.1.0.dist-info}/WHEEL +0 -0
  54. {opik_optimizer-1.0.6.dist-info → opik_optimizer-1.1.0.dist-info}/top_level.txt +0 -0
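Items 44-47 split the single opik_optimizer/utils.py module into a utils/ package (core.py, dataset_utils.py, prompt_segments.py) plus a new __init__.py. Call sites inside the package, such as `from .. import utils` in the evolutionary optimizer below, are unchanged, which suggests the new utils/__init__.py re-exports the legacy helpers. A rough, illustrative sketch of that layout (the exact export list is not shown in this diff):

# opik_optimizer/utils/__init__.py -- illustrative sketch, not the released file
from .core import json_to_dict  # legacy helper still referenced by the optimizers
from . import dataset_utils, prompt_segments

__all__ = ["json_to_dict", "dataset_utils", "prompt_segments"]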
@@ -1,40 +1,39 @@
  import json
  import logging
- import os
  import random
- from typing import Any, Callable, Dict, List, Optional, Set, Tuple, cast, Type
+ from typing import Any, cast, TYPE_CHECKING
+ from collections.abc import Callable
+ import sys
+ import warnings

  import rapidfuzz.distance.Indel
- import litellm
  import numpy as np
  import opik

  # DEAP imports
  from deap import base, tools
  from deap import creator as _creator
- from litellm import exceptions as litellm_exceptions
- from litellm.caching import Cache
- from litellm.types.caching import LiteLLMCacheType
  from opik.api_objects import opik_client, optimization
  from opik.environment import get_tqdm_for_current_environment
- from opik.evaluation.models.litellm import opik_monitor as opik_litellm_monitor

- from opik_optimizer import _throttle, task_evaluator
  from opik_optimizer.base_optimizer import BaseOptimizer, OptimizationRound
- from opik_optimizer.optimization_config import chat_prompt, mappers
+ from opik_optimizer.optimization_config import chat_prompt
  from opik_optimizer.optimization_result import OptimizationResult
  from opik_optimizer.optimizable_agent import OptimizableAgent

  from .. import utils
  from . import reporting
+ from .llm_support import LlmSupport
+ from .mutation_ops import MutationOps
+ from .crossover_ops import CrossoverOps
+ from .population_ops import PopulationOps
+ from .evaluation_ops import EvaluationOps
+ from .helpers import Helpers
+ from .style_ops import StyleOps
+ from . import prompts as evo_prompts

  logger = logging.getLogger(__name__)
  tqdm = get_tqdm_for_current_environment()
- _rate_limiter = _throttle.get_rate_limiter_for_current_opik_installation()
-
- # Using disk cache for LLM calls
- disk_cache_dir = os.path.expanduser("~/.litellm_cache")
- litellm.cache = Cache(type=LiteLLMCacheType.DISK, disk_cache_dir=disk_cache_dir)

  creator = cast(Any, _creator) # type: ignore[assignment]

@@ -80,19 +79,14 @@ class EvolutionaryOptimizer(BaseOptimizer):
  )
  DEFAULT_MOO_WEIGHTS = (1.0, -1.0) # (Maximize Score, Minimize Length)

- _INFER_STYLE_SYSTEM_PROMPT = """You are an expert in linguistic analysis and prompt engineering. Your task is to analyze a few input-output examples from a dataset and provide a concise, actionable description of the desired output style. This description will be used to guide other LLMs in generating and refining prompts.
-
- Focus on characteristics like:
- - **Length**: (e.g., single word, short phrase, one sentence, multiple sentences, a paragraph)
- - **Tone**: (e.g., factual, formal, informal, conversational, academic)
- - **Structure**: (e.g., direct answer first, explanation then answer, list, yes/no then explanation)
- - **Content Details**: (e.g., includes only the answer, includes reasoning, provides examples, avoids pleasantries)
- - **Keywords/Phrasing**: Any recurring keywords or phrasing patterns in the outputs.
-
- Provide a single string that summarizes this style. This summary should be directly usable as an instruction for another LLM.
- For example: 'Outputs should be a single, concise proper noun.' OR 'Outputs should be a short paragraph explaining the reasoning, followed by a direct answer, avoiding conversational pleasantries.' OR 'Outputs are typically 1-2 sentences, providing a direct factual answer.'
- Return ONLY this descriptive string, with no preamble or extra formatting.
- """
+ # Prompt constants moved into prompts.py
+ if TYPE_CHECKING:
+ _llm_deap_crossover: Any
+ _deap_crossover: Any
+ _deap_mutation: Any
+ _initialize_population: Any
+ _evaluate_prompt: Any
+ _infer_output_style_from_dataset: Any

  def __init__(
  self,
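The `if TYPE_CHECKING:` attribute stubs above exist only for static analysis: the real callables are bound onto the instance at runtime (see `_attach_helper_methods` later in this diff), so without the stubs a type checker would flag every `self._deap_mutation(...)` call as an unknown attribute. A minimal illustration of the pattern (not the library's code):

from typing import TYPE_CHECKING, Any

class Optimizer:
    if TYPE_CHECKING:
        # Visible to mypy/pyright only; TYPE_CHECKING is False at runtime,
        # so no attribute is actually created here.
        _deap_mutation: Any

    def __init__(self) -> None:
        # The attribute is attached dynamically, e.g. from a mixin module.
        self._deap_mutation = lambda individual: individual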
@@ -102,13 +96,13 @@ Return ONLY this descriptive string, with no preamble or extra formatting.
  mutation_rate: float = DEFAULT_MUTATION_RATE,
  crossover_rate: float = DEFAULT_CROSSOVER_RATE,
  tournament_size: int = DEFAULT_TOURNAMENT_SIZE,
- num_threads: Optional[int] = None,
+ num_threads: int | None = None,
  elitism_size: int = DEFAULT_ELITISM_SIZE,
  adaptive_mutation: bool = DEFAULT_ADAPTIVE_MUTATION,
  enable_moo: bool = DEFAULT_ENABLE_MOO,
  enable_llm_crossover: bool = DEFAULT_ENABLE_LLM_CROSSOVER,
- seed: Optional[int] = DEFAULT_SEED,
- output_style_guidance: Optional[str] = None,
+ seed: int | None = DEFAULT_SEED,
+ output_style_guidance: str | None = None,
  infer_output_style: bool = False,
  verbose: int = 1,
  n_threads: int = DEFAULT_NUM_THREADS,
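The signature changes in this hunk are purely a typing modernization: `Optional[X]` becomes `X | None` and the `typing.List`/`Dict`/`Tuple` aliases become built-in generics. Both spellings are equivalent; the new form evaluates natively on Python 3.10 and later. For example:

from typing import Optional

def old_style(seed: Optional[int] = None) -> list:
    return [seed]

def new_style(seed: int | None = None) -> list[int | None]:  # PEP 604 / PEP 585
    return [seed]

assert old_style(42) == new_style(42)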
@@ -134,6 +128,12 @@ Return ONLY this descriptive string, with no preamble or extra formatting.
  **model_kwargs: Additional model parameters
  """
  # Initialize base class first
+ if sys.version_info >= (3, 13):
+ warnings.warn(
+ "Python 3.13 is not officially supported (python_requires <3.13). "
+ "You may see asyncio teardown warnings. Prefer Python 3.12.",
+ RuntimeWarning,
+ )
  if "project_name" in model_kwargs:
  print(
  "Removing `project_name` from constructor; it now belongs in the ChatPrompt()"
@@ -163,13 +163,13 @@ Return ONLY this descriptive string, with no preamble or extra formatting.
  self.infer_output_style = infer_output_style
  self.llm_call_counter = 0
  self._opik_client = opik_client.get_client_cached()
- self._current_optimization_id: Optional[str] = None
+ self._current_optimization_id: str | None = None
  self._current_generation = 0
- self._best_fitness_history: List[float] = []
+ self._best_fitness_history: list[float] = []
  self._generations_without_improvement = 0
- self._current_population: List[Any] = []
+ self._current_population: list[Any] = []
  self._generations_without_overall_improvement = 0
- self._best_primary_score_history: List[float] = []
+ self._best_primary_score_history: list[float] = []
  self._gens_since_pop_improvement: int = 0

  if self.seed is not None:
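The trailing context line shows where seed handling begins. The module imports both random and numpy, so a reproducible run presumably seeds both generators, along these lines (illustrative, not the exact implementation):

import random
import numpy as np

seed = 42
random.seed(seed)     # drives crossover/mutation choices made via `random`
np.random.seed(seed)  # drives any numpy-based sampling
print(random.random(), np.random.rand())  # identical on every run with the same seed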
@@ -199,6 +199,9 @@ Return ONLY this descriptive string, with no preamble or extra formatting.
  creator.create("Individual", list, fitness=fitness_attr)

  self.toolbox = base.Toolbox()
+ # Attach methods from helper mixin modules to this instance to avoid
+ # multiple inheritance while preserving behavior.
+ self._attach_helper_methods()
  self.toolbox.register(
  "default_individual", lambda: creator.Individual("placeholder")
  )
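For readers unfamiliar with DEAP: `creator.create` builds the Individual/Fitness classes, and `toolbox.register` stores a callable (optionally with frozen arguments) under an alias that the rest of the optimizer invokes by name. A small self-contained sketch of the same pattern (assumes `deap` is installed; not the optimizer's exact setup):

from deap import base, creator, tools

creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", list, fitness=creator.FitnessMax)

toolbox = base.Toolbox()
toolbox.register("default_individual", lambda: creator.Individual(["placeholder"]))
toolbox.register("select", tools.selTournament, tournsize=4)

ind = toolbox.default_individual()
print(ind, ind.fitness.valid)  # ['placeholder'] False  (no fitness assigned yet)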
@@ -228,6 +231,65 @@ Return ONLY this descriptive string, with no preamble or extra formatting.
228
  f"mutation_rate: {self.mutation_rate}, crossover_rate: {self.crossover_rate}"
  )

+ # (methods already attached above)
+
+ def _attach_helper_methods(self) -> None:
+ """Bind selected methods from mixin modules onto this instance."""
+
+ def bind(cls: Any, names: list[str]) -> None:
+ for name in names:
+ func = getattr(cls, name)
+ setattr(self, name, func.__get__(self, self.__class__))
+
+ # LLM calls
+ bind(LlmSupport, ["_call_model"])
+
+ # Mutations
+ bind(
+ MutationOps,
+ [
+ "_deap_mutation",
+ "_semantic_mutation",
+ "_structural_mutation",
+ "_word_level_mutation_prompt",
+ "_word_level_mutation",
+ "_get_synonym",
+ "_modify_phrase",
+ "_radical_innovation_mutation",
+ ],
+ )
+
+ # Crossover
+ bind(
+ CrossoverOps,
+ [
+ "_deap_crossover_chunking_strategy",
+ "_deap_crossover_word_level",
+ "_deap_crossover",
+ "_llm_deap_crossover",
+ "_extract_json_arrays",
+ ],
+ )
+
+ # Population management
+ bind(
+ PopulationOps,
+ [
+ "_initialize_population",
+ "_should_restart_population",
+ "_restart_population",
+ ],
+ )
+
+ # Evaluation
+ bind(EvaluationOps, ["_evaluate_prompt"])
+
+ # Helpers
+ bind(Helpers, ["_get_task_description_for_llm"])
+
+ # Style inference
+ bind(StyleOps, ["_infer_output_style_from_dataset"])
+
  def _get_adaptive_mutation_rate(self) -> float:
  """Calculate adaptive mutation rate based on population diversity and progress."""
  if not self.adaptive_mutation or len(self._best_fitness_history) < 2:
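The `bind` helper in the hunk above relies on the descriptor protocol: a plain function looked up on a class can be turned into a method bound to any object by calling its `__get__`. That is what lets the optimizer pull behavior out of the new *_ops modules without multiple inheritance. A minimal, self-contained illustration (hypothetical classes, not the library's code):

class MutationOps:
    def _shout(self, text: str) -> str:
        # `self` here is whatever object the function gets bound to.
        return text.upper()

class Optimizer:
    def __init__(self) -> None:
        func = getattr(MutationOps, "_shout")
        # function.__get__(instance, owner) returns a bound method.
        setattr(self, "_shout", func.__get__(self, self.__class__))

opt = Optimizer()
print(opt._shout("hi"))  # -> HI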
@@ -286,552 +348,7 @@ Return ONLY this descriptive string, with no preamble or extra formatting.
286
348
 
287
349
  return total_distance / count if count > 0 else 0.0
288
350
 
289
- def _deap_crossover_chunking_strategy(
290
- self, messages_1_str: str, messages_2_str: str
291
- ) -> Tuple[str, str]:
292
- chunks1 = [
293
- chunk.strip() for chunk in messages_1_str.split(".") if chunk.strip()
294
- ]
295
- chunks2 = [
296
- chunk.strip() for chunk in messages_2_str.split(".") if chunk.strip()
297
- ]
298
-
299
- # Try chunk-level crossover if both parents have at least 2 chunks
300
- if len(chunks1) >= 2 and len(chunks2) >= 2:
301
- min_num_chunks = min(len(chunks1), len(chunks2))
302
- # Crossover point is between 1 and min_num_chunks - 1
303
- # This requires min_num_chunks >= 2, which is already checked.
304
- point = random.randint(1, min_num_chunks - 1)
305
-
306
- child1_chunks = chunks1[:point] + chunks2[point:]
307
- child2_chunks = chunks2[:point] + chunks1[point:]
308
-
309
- child1_str = ". ".join(child1_chunks) + ("." if child1_chunks else "")
310
- child2_str = ". ".join(child2_chunks) + ("." if child2_chunks else "")
311
-
312
- return child1_str, child2_str
313
- else:
314
- raise ValueError(
315
- "Not enough chunks in either prompt for chunk-level crossover"
316
- )
317
-
318
- def _deap_crossover_word_level(
319
- self, messages_1_str: str, messages_2_str: str
320
- ) -> Tuple[str, str]:
321
- words1 = messages_1_str.split()
322
- words2 = messages_2_str.split()
323
-
324
- # If either prompt is empty (no words), return parents
325
- if not words1 or not words2:
326
- return messages_1_str, messages_2_str
327
-
328
- min_word_len = min(len(words1), len(words2))
329
- # Need at least 2 words in the shorter prompt for a valid crossover point
330
- if min_word_len < 2:
331
- return messages_1_str, messages_2_str
332
-
333
- # Crossover point for words: 1 to min_word_len - 1
334
- point = random.randint(1, min_word_len - 1)
335
- child1_words = words1[:point] + words2[point:]
336
- child2_words = words2[:point] + words1[point:]
337
-
338
- return " ".join(child1_words), " ".join(child2_words)
339
-
340
- def _deap_crossover(self, ind1: Any, ind2: Any) -> Tuple[Any, Any]:
341
- """Enhanced crossover operation that preserves semantic meaning.
342
- Attempts chunk-level crossover first, then falls back to word-level.
343
- """
344
- reporting.display_message(
345
- " Recombining prompts by mixing and matching words and sentences.",
346
- verbose=self.verbose,
347
- )
348
- messages_1_orig: List[Dict[str, str]] = ind1
349
- messages_2_orig: List[Dict[str, str]] = ind2
350
-
351
- for i, message_1 in enumerate(messages_1_orig):
352
- role: str = message_1["role"]
353
- message_1_str: str = message_1["content"]
354
-
355
- # We check that the second message has enough AI messages and the correct role
356
- if (len(messages_2_orig) >= i + 1) and (messages_2_orig[i]["role"] == role):
357
- message_2 = messages_2_orig[i]
358
- message_2_str: str = message_2["content"]
359
-
360
- try:
361
- child1_str, child2_str = self._deap_crossover_chunking_strategy(
362
- message_1_str, message_2_str
363
- )
364
- except ValueError:
365
- child1_str, child2_str = self._deap_crossover_word_level(
366
- message_1_str, message_2_str
367
- )
368
-
369
- # Update the message content
370
- messages_1_orig[i]["content"] = child1_str
371
- messages_2_orig[i]["content"] = child2_str
372
- else:
373
- # We don't perform any crossover if there are not enough messages or the roles
374
- # don't match
375
- pass
376
-
377
- return creator.Individual(messages_1_orig), creator.Individual(messages_2_orig)
378
-
379
- def _deap_mutation(
380
- self, individual: Any, initial_prompt: chat_prompt.ChatPrompt
381
- ) -> Any:
382
- """Enhanced mutation operation with multiple strategies."""
383
- prompt = chat_prompt.ChatPrompt(messages=individual)
384
-
385
- # Choose mutation strategy based on current diversity
386
- diversity = self._calculate_population_diversity()
387
-
388
- # Determine thresholds based on diversity
389
- if diversity < self.DEFAULT_DIVERSITY_THRESHOLD:
390
- # Low diversity - use more aggressive mutations (higher chance for semantic)
391
- semantic_threshold = 0.5
392
- structural_threshold = 0.8 # semantic_threshold + 0.3
393
- else:
394
- # Good diversity - use more conservative mutations (higher chance for word_level)
395
- semantic_threshold = 0.4
396
- structural_threshold = 0.7 # semantic_threshold + 0.3
397
-
398
- mutation_choice = random.random()
399
-
400
- if mutation_choice > structural_threshold:
401
- # This corresponds to the original 'else' (word_level_mutation)
402
- mutated_prompt = self._word_level_mutation_prompt(prompt)
403
- reporting.display_success(
404
- " Mutation successful, prompt has been edited by randomizing words (word-level mutation).",
405
- verbose=self.verbose,
406
- )
407
- return creator.Individual(mutated_prompt.get_messages())
408
- elif mutation_choice > semantic_threshold:
409
- # This corresponds to the original 'elif' (structural_mutation)
410
- mutated_prompt = self._structural_mutation(prompt)
411
- reporting.display_success(
412
- " Mutation successful, prompt has been edited by reordering, combining, or splitting sentences (structural mutation).",
413
- verbose=self.verbose,
414
- )
415
- return creator.Individual(mutated_prompt.get_messages())
416
- else:
417
- # This corresponds to the original 'if' (semantic_mutation)
418
- mutated_prompt = self._semantic_mutation(prompt, initial_prompt)
419
- reporting.display_success(
420
- " Mutation successful, prompt has been edited using an LLM (semantic mutation).",
421
- verbose=self.verbose,
422
- )
423
- return creator.Individual(mutated_prompt.get_messages())
424
-
425
- def _semantic_mutation(
426
- self, prompt: chat_prompt.ChatPrompt, initial_prompt: chat_prompt.ChatPrompt
427
- ) -> chat_prompt.ChatPrompt:
428
- """Enhanced semantic mutation with multiple strategies."""
429
- current_output_style_guidance = self.output_style_guidance
430
- if random.random() < 0.1:
431
- return self._radical_innovation_mutation(prompt, initial_prompt)
432
-
433
- try:
434
- strategy = random.choice(
435
- [
436
- "rephrase",
437
- "simplify",
438
- "elaborate",
439
- "restructure",
440
- "focus",
441
- "increase_complexity_and_detail",
442
- ]
443
- )
444
-
445
- strategy_prompts = {
446
- "rephrase": f"Create a different way to express the same instruction, possibly with a different length or structure, ensuring it still aims for an answer from the target LLM in the style of: '{current_output_style_guidance}'.",
447
- "simplify": f"Simplify the instruction while maintaining its core meaning, potentially making it more concise, to elicit an answer in the style of: '{current_output_style_guidance}'.",
448
- "elaborate": f"Add more relevant detail and specificity to the instruction, potentially increasing its length, but only if it helps achieve a more accurate answer from the target LLM in the style of: '{current_output_style_guidance}'.",
449
- "restructure": f"Change the structure of the instruction (e.g., reorder sentences, combine/split ideas) while keeping its intent, ensuring the new structure strongly guides towards an output in the style of: '{current_output_style_guidance}'.",
450
- "focus": f"Emphasize the key aspects of the instruction, perhaps by rephrasing or adding clarifying statements, to better elicit an answer in the style of: '{current_output_style_guidance}'.",
451
- "increase_complexity_and_detail": f"Significantly elaborate on this instruction. Add more details, examples, context, or constraints to make it more comprehensive. The goal of this elaboration is to make the prompt itself more detailed, so that it VERY CLEARLY guides the target LLM to produce a highly accurate final answer in the style of: '{current_output_style_guidance}'. The prompt can be long if needed to achieve this output style.",
452
- }
453
-
454
- user_prompt_for_semantic_mutation = f"""Given this prompt: '{prompt}'
455
- Task context: {self._get_task_description_for_llm(initial_prompt)}
456
- Desired output style from target LLM: '{current_output_style_guidance}'
457
- Instruction for this modification: {strategy_prompts[strategy]}.
458
- Return only the modified prompt message list, nothing else. Make sure to return a valid JSON object.
459
- """
460
- response = self._call_model(
461
- messages=[
462
- {
463
- "role": "system",
464
- "content": f"You are a prompt engineering expert. Your goal is to modify prompts to improve their effectiveness in eliciting specific types of answers, particularly matching the style: '{current_output_style_guidance}'. Follow the specific modification instruction provided.",
465
- },
466
- {"role": "user", "content": user_prompt_for_semantic_mutation},
467
- ],
468
- is_reasoning=True,
469
- )
470
-
471
- return chat_prompt.ChatPrompt(messages=utils.json_to_dict(response.strip()))
472
- except Exception as e:
473
- reporting.display_error(
474
- f" Error in semantic mutation, this is usually a parsing error: {e}",
475
- verbose=self.verbose,
476
- )
477
- return prompt
478
-
479
- def _structural_mutation(
480
- self, prompt: chat_prompt.ChatPrompt
481
- ) -> chat_prompt.ChatPrompt:
482
- """Perform structural mutation (reordering, combining, splitting)."""
483
- mutated_messages: List[Dict[str, str]] = []
484
-
485
- for message in prompt.get_messages():
486
- content = message["content"]
487
- role = message["role"]
488
-
489
- sentences = [s.strip() for s in content.split(".") if s.strip()]
490
- if len(sentences) <= 1:
491
- mutated_messages.append(
492
- {"role": role, "content": self._word_level_mutation(content)}
493
- )
494
- continue
495
-
496
- mutation_type = random.random()
497
- if mutation_type < 0.3:
498
- # Reorder sentences
499
- random.shuffle(sentences)
500
- mutated_messages.append(
501
- {"role": role, "content": ". ".join(sentences) + "."}
502
- )
503
- continue
504
- elif mutation_type < 0.6:
505
- # Combine adjacent sentences
506
- if len(sentences) >= 2:
507
- idx = random.randint(0, len(sentences) - 2)
508
- combined = sentences[idx] + " and " + sentences[idx + 1]
509
- sentences[idx : idx + 2] = [combined]
510
- mutated_messages.append(
511
- {"role": role, "content": ". ".join(sentences) + "."}
512
- )
513
- continue
514
- else:
515
- # Split a sentence
516
- idx = random.randint(0, len(sentences) - 1)
517
- words = sentences[idx].split()
518
- if len(words) > 3:
519
- split_point = random.randint(2, len(words) - 2)
520
- sentences[idx : idx + 1] = [
521
- " ".join(words[:split_point]),
522
- " ".join(words[split_point:]),
523
- ]
524
- mutated_messages.append(
525
- {"role": role, "content": ". ".join(sentences) + "."}
526
- )
527
- continue
528
- else:
529
- mutated_messages.append({"role": role, "content": content})
530
-
531
- return chat_prompt.ChatPrompt(messages=mutated_messages)
532
-
533
- def _word_level_mutation_prompt(
534
- self, prompt: chat_prompt.ChatPrompt
535
- ) -> chat_prompt.ChatPrompt:
536
- mutated_messages: List[Dict[str, str]] = []
537
- for message in prompt.get_messages():
538
- mutated_messages.append(
539
- {
540
- "role": message["role"],
541
- "content": self._word_level_mutation(message["content"]),
542
- }
543
- )
544
- return chat_prompt.ChatPrompt(messages=mutated_messages)
545
-
546
- def _word_level_mutation(self, msg_content: str) -> str:
547
- """Perform word-level mutation."""
548
- words = msg_content.split()
549
- if len(words) <= 1:
550
- return msg_content
551
-
552
- mutation_type = random.random()
553
- if mutation_type < 0.3:
554
- # Word replacement
555
- idx = random.randint(0, len(words) - 1)
556
- words[idx] = self._get_synonym(words[idx])
557
- elif mutation_type < 0.6:
558
- # Word reordering
559
- if len(words) > 2:
560
- i, j = random.sample(range(len(words)), 2)
561
- words[i], words[j] = words[j], words[i]
562
- else:
563
- # Phrase modification
564
- idx = random.randint(0, len(words) - 1)
565
- words[idx] = self._modify_phrase(words[idx])
566
-
567
- return " ".join(words)
568
-
569
- def _get_synonym(self, word: str) -> str:
570
- """Get a synonym for a word using LLM."""
571
- try:
572
- response = self._call_model(
573
- messages=[
574
- {
575
- "role": "system",
576
- "content": "You are a helpful assistant that provides synonyms. Return only the synonym word, no explanation or additional text.",
577
- },
578
- {
579
- "role": "user",
580
- "content": f"Give me a single synonym for the word '{word}'. Return only the synonym, nothing else.",
581
- },
582
- ],
583
- is_reasoning=True,
584
- )
585
- return response.strip()
586
- except Exception as e:
587
- logger.warning(f"Error getting synonym for '{word}': {e}")
588
- return word
589
-
590
- def _modify_phrase(self, phrase: str) -> str:
591
- """Modify a phrase while preserving meaning using LLM."""
592
- try:
593
- response = self._call_model(
594
- messages=[
595
- {
596
- "role": "system",
597
- "content": "You are a helpful assistant that rephrases text. Return only the modified phrase, no explanation or additional text.",
598
- },
599
- {
600
- "role": "user",
601
- "content": f"Modify this phrase while keeping the same meaning: '{phrase}'. Return only the modified phrase, nothing else.",
602
- },
603
- ],
604
- is_reasoning=True,
605
- )
606
- return response.strip()
607
- except Exception as e:
608
- logger.warning(f"Error modifying phrase '{phrase}': {e}")
609
- return phrase
610
-
611
- def _radical_innovation_mutation(
612
- self, prompt: chat_prompt.ChatPrompt, initial_prompt: chat_prompt.ChatPrompt
613
- ) -> chat_prompt.ChatPrompt:
614
- """Attempts to generate a significantly improved and potentially very different prompt using an LLM."""
615
- logger.debug(
616
- f"Attempting radical innovation for prompt: {json.dumps(prompt.get_messages())[:70]}..."
617
- )
618
- task_desc_for_llm = self._get_task_description_for_llm(initial_prompt)
619
- current_output_style_guidance = self.output_style_guidance
620
-
621
- user_prompt_for_radical_innovation = f"""Task Context:
622
- {task_desc_for_llm}
623
- Desired output style from target LLM: '{current_output_style_guidance}'
624
-
625
- Existing Prompt (which may be underperforming):
626
- '''{prompt.get_messages()}'''
627
-
628
- Please generate a new, significantly improved, and potentially very different prompt for this task.
629
- Focus on alternative approaches, better clarity, or more effective guidance for the language model, aiming for the desired output style.
630
- Return only the new prompt list object.
631
- """
632
- try:
633
- new_prompt_str = self._call_model(
634
- messages=[
635
- {
636
- "role": "system",
637
- "content": self._get_radical_innovation_system_prompt(),
638
- },
639
- {"role": "user", "content": user_prompt_for_radical_innovation},
640
- ],
641
- is_reasoning=True,
642
- )
643
- logger.info(
644
- f"Radical innovation generated: {new_prompt_str[:70]}... from: {json.dumps(prompt.get_messages())[:70]}..."
645
- )
646
- return chat_prompt.ChatPrompt(messages=json.loads(new_prompt_str))
647
- except Exception as e:
648
- logger.warning(
649
- f"Radical innovation mutation failed for prompt '{json.dumps(prompt.get_messages())[:50]}...': {e}. Returning original."
650
- )
651
- return prompt
652
-
653
- def _initialize_population(
654
- self, prompt: chat_prompt.ChatPrompt
655
- ) -> List[chat_prompt.ChatPrompt]:
656
- """Initialize the population with diverse variations of the initial prompt,
657
- including some 'fresh start' prompts based purely on task description.
658
- All generated prompts should aim to elicit answers matching self.output_style_guidance.
659
- """
660
- with reporting.initializing_population(verbose=self.verbose) as init_pop_report:
661
- init_pop_report.start(self.population_size)
662
-
663
- population = [prompt]
664
- if self.population_size <= 1:
665
- return population
666
-
667
- num_to_generate_total = self.population_size - 1
668
- num_fresh_starts = max(1, int(num_to_generate_total * 0.2))
669
- num_variations_on_initial = num_to_generate_total - num_fresh_starts
670
-
671
- task_desc_for_llm = self._get_task_description_for_llm(prompt)
672
- current_output_style_guidance = self.output_style_guidance
673
-
674
- # Generate "fresh start" prompts if the initial prompt is not performing well
675
- # Cold start prompts are generated from the task description
676
- if num_fresh_starts > 0:
677
- init_pop_report.start_fresh_prompts(num_fresh_starts)
678
- fresh_start_user_prompt = f"""Here is a description of a task:
679
- {task_desc_for_llm}
680
-
681
- The goal is to generate prompts that will make a target LLM produce responses in the following style: '{current_output_style_guidance}'.
682
-
683
- Please generate {num_fresh_starts} diverse and effective prompt(s) for a language model to accomplish this task, ensuring they guide towards this specific output style.
684
- Focus on clarity, completeness, and guiding the model effectively towards the desired style. Explore different structural approaches.
685
-
686
- Example of valid response: [
687
- ["role": "<role>", "content": "<Prompt targeting specified style.>"],
688
- ["role": "<role>", "content": "<Another prompt designed for the output style.>"]
689
- ]
690
-
691
- Your response MUST be a valid JSON list of AI messages. Do NOT include any other text, explanations, or Markdown formatting like ```json ... ``` around the list.
692
-
693
- """
694
- try:
695
- response_content = self._call_model(
696
- messages=[
697
- {
698
- "role": "system",
699
- "content": f"You are an expert prompt engineer. Your task is to generate novel, effective prompts from scratch based on a task description, specifically aiming for prompts that elicit answers in the style: '{current_output_style_guidance}'. Output ONLY a raw JSON list of strings.",
700
- },
701
- {"role": "user", "content": fresh_start_user_prompt},
702
- ],
703
- is_reasoning=True,
704
- )
705
-
706
- logger.debug(
707
- f"Raw LLM response for fresh start prompts: {response_content}"
708
- )
709
-
710
- fresh_prompts = utils.json_to_dict(response_content)
711
- if isinstance(fresh_prompts, list):
712
- if all(isinstance(p, dict) for p in fresh_prompts) and all(
713
- p.get("role") is not None for p in fresh_prompts
714
- ):
715
- population.append(
716
- chat_prompt.ChatPrompt(messages=fresh_prompts)
717
- )
718
- init_pop_report.success_fresh_prompts(1)
719
- elif all(isinstance(p, list) for p in fresh_prompts):
720
- population.extend(
721
- [
722
- chat_prompt.ChatPrompt(messages=p)
723
- for p in fresh_prompts[:num_fresh_starts]
724
- ]
725
- )
726
- init_pop_report.success_fresh_prompts(
727
- len(fresh_prompts[:num_fresh_starts])
728
- )
729
- else:
730
- init_pop_report.failed_fresh_prompts(
731
- num_fresh_starts,
732
- f"LLM response for fresh starts was not a valid list of strings or was empty: {response_content}. Skipping fresh start prompts.",
733
- )
734
- except json.JSONDecodeError as e_json:
735
- init_pop_report.failed_fresh_prompts(
736
- num_fresh_starts,
737
- f"JSONDecodeError generating fresh start prompts: {e_json}. LLM response: '{response_content}'. Skipping fresh start prompts.",
738
- )
739
- except Exception as e:
740
- init_pop_report.failed_fresh_prompts(
741
- num_fresh_starts,
742
- f"Error generating fresh start prompts: {e}. Skipping fresh start prompts.",
743
- )
744
-
745
- # Generate variations on the initial prompt for the remaining slots
746
- # TODO: Could add variations with hyper-parameters from the task config like temperature, etc.
747
- if num_variations_on_initial > 0:
748
- init_pop_report.start_variations(num_variations_on_initial)
749
-
750
- # TODO: We need to split this into batches as the model will not return enough tokens
751
- # to generate all the candidates
752
- user_prompt_for_variation = f"""Initial prompt:
753
- '''{prompt.get_messages()}'''
754
-
755
- Task context:
756
- {task_desc_for_llm}
757
- Desired output style from target LLM: '{current_output_style_guidance}'
758
-
759
- Generate {num_variations_on_initial} diverse alternative prompts based on the initial prompt above, keeping the task context and desired output style in mind.
760
- All generated prompt variations should strongly aim to elicit answers from the target LLM matching the style: '{current_output_style_guidance}'.
761
- For each variation, consider how to best achieve this style, e.g., by adjusting specificity, structure, phrasing, constraints, or by explicitly requesting it.
762
-
763
- Return a JSON array of prompts with the following structure:
764
- {{
765
- "prompts": [
766
- {{
767
- "prompt": [{{"role": "<role>", "content": "<content>"}}],
768
- "strategy": "brief description of the variation strategy used, e.g., 'direct instruction for target style'"
769
- }}
770
- // ... more prompts if num_variations_on_initial > 1
771
- ]
772
- }}
773
- Ensure a good mix of variations, all targeting the specified output style from the end LLM.
774
-
775
- Return a valid JSON object that is correctly escaped. Return nothing else, d`o not include any additional text or Markdown formatting.
776
- """
777
- try:
778
- response_content_variations = self._call_model(
779
- messages=[
780
- {
781
- "role": "system",
782
- "content": self._get_reasoning_system_prompt_for_variation(),
783
- },
784
- {"role": "user", "content": user_prompt_for_variation},
785
- ],
786
- is_reasoning=True,
787
- )
788
- logger.debug(
789
- f"Raw response for population variations: {response_content_variations}"
790
- )
791
- json_response_variations = json.loads(response_content_variations)
792
- generated_prompts_variations = [
793
- p["prompt"]
794
- for p in json_response_variations.get("prompts", [])
795
- if isinstance(p, dict) and "prompt" in p
796
- ]
797
-
798
- if generated_prompts_variations:
799
- init_pop_report.success_variations(
800
- len(
801
- generated_prompts_variations[:num_variations_on_initial]
802
- )
803
- )
804
- population.extend(
805
- [
806
- chat_prompt.ChatPrompt(messages=p)
807
- for p in generated_prompts_variations[
808
- :num_variations_on_initial
809
- ]
810
- ]
811
- )
812
- else:
813
- init_pop_report.failed_variations(
814
- num_variations_on_initial,
815
- "Could not parse 'prompts' list for variations. Skipping variations.",
816
- )
817
- except Exception as e:
818
- init_pop_report.failed_variations(
819
- num_variations_on_initial,
820
- f"Error calling LLM for initial population variations: {e}",
821
- )
822
-
823
- # Ensure population is of the required size using unique prompts
824
- # TODO Test with levenshtein distance
825
- final_population_set: Set[str] = set()
826
- final_population_list: List[chat_prompt.ChatPrompt] = []
827
- for p in population:
828
- if json.dumps(p.get_messages()) not in final_population_set:
829
- final_population_set.add(json.dumps(p.get_messages()))
830
- final_population_list.append(p)
831
-
832
- init_pop_report.end(final_population_list)
833
- # Return exactly population_size prompts if possible, or fewer if generation failed badly.
834
- return final_population_list[: self.population_size]
351
+ # Mutations and helpers are implemented in mixins.
835
352
 
836
353
  def _should_restart_population(self, curr_best: float) -> bool:
837
354
  """
@@ -852,9 +369,9 @@ Return only the new prompt list object.
852
369
  def _restart_population(
853
370
  self,
854
371
  hof: tools.HallOfFame,
855
- population: List[Any],
372
+ population: list[Any],
856
373
  best_prompt_so_far: chat_prompt.ChatPrompt,
857
- ) -> List[Any]:
374
+ ) -> list[Any]:
858
375
  """Return a fresh, evaluated population seeded by elites."""
859
376
  if self.enable_moo:
860
377
  elites = list(hof)
@@ -881,12 +398,12 @@ Return only the new prompt list object.
881
398
  def _run_generation(
882
399
  self,
883
400
  generation_idx: int,
884
- population: List[Any],
401
+ population: list[Any],
885
402
  prompt: chat_prompt.ChatPrompt,
886
403
  hof: tools.HallOfFame,
887
404
  report: Any,
888
405
  best_primary_score_overall: float,
889
- ) -> tuple[List[Any], int]:
406
+ ) -> tuple[list[Any], int]:
890
407
  """Execute mating, mutation, evaluation and HoF update."""
891
408
  best_gen_score = 0.0
892
409
 
@@ -952,7 +469,7 @@ Return only the new prompt list object.
952
469
 
953
470
  return offspring, len(invalid)
954
471
 
955
- def _population_best_score(self, population: List[Any]) -> float:
472
+ def _population_best_score(self, population: list[Any]) -> float:
956
473
  """Return highest primary-objective score among *valid* individuals."""
957
474
  valid_scores = [
958
475
  ind.fitness.values[0] for ind in population if ind.fitness.valid
@@ -964,10 +481,10 @@ Return only the new prompt list object.
964
481
  prompt: chat_prompt.ChatPrompt,
965
482
  dataset: opik.Dataset,
966
483
  metric: Callable,
967
- experiment_config: Optional[Dict] = None,
968
- n_samples: Optional[int] = None,
484
+ experiment_config: dict | None = None,
485
+ n_samples: int | None = None,
969
486
  auto_continue: bool = False,
970
- agent_class: Optional[Type[OptimizableAgent]] = None,
487
+ agent_class: type[OptimizableAgent] | None = None,
971
488
  **kwargs: Any,
972
489
  ) -> OptimizationResult:
973
490
  """
@@ -1004,7 +521,7 @@ Return only the new prompt list object.
1004
521
  self.project_name = self.agent_class.project_name
1005
522
 
1006
523
  # Step 0. Start Opik optimization run
1007
- opik_optimization_run: Optional[optimization.Optimization] = None
524
+ opik_optimization_run: optimization.Optimization | None = None
1008
525
  try:
1009
526
  opik_optimization_run = self._opik_client.create_optimization(
1010
527
  dataset_name=dataset.name,
@@ -1026,18 +543,19 @@ Return only the new prompt list object.
1026
543
  reporting.display_configuration(
1027
544
  prompt.get_messages(),
1028
545
  {
1029
- "optimizer": f"{ 'DEAP MOO' if self.enable_moo else 'DEAP SO' } Evolutionary Optimization",
546
+ "optimizer": f"{'DEAP MOO' if self.enable_moo else 'DEAP SO'} Evolutionary Optimization",
1030
547
  "population_size": self.population_size,
1031
548
  "generations": self.num_generations,
1032
549
  "mutation_rate": self.mutation_rate,
1033
550
  "crossover_rate": self.crossover_rate,
1034
551
  },
1035
552
  verbose=self.verbose,
553
+ tools=getattr(prompt, "tools", None),
1036
554
  )
1037
555
 
1038
556
  # Step 1. Step variables and define fitness function
1039
557
  self.llm_call_counter = 0
1040
- self._history: List[OptimizationRound] = []
558
+ self._history: list[OptimizationRound] = []
1041
559
  self._current_generation = 0
1042
560
  self._best_fitness_history = []
1043
561
  self._generations_without_improvement = 0
@@ -1047,8 +565,8 @@ Return only the new prompt list object.
1047
565
  if self.enable_moo:
1048
566
 
1049
567
  def _deap_evaluate_individual_fitness(
1050
- messages: List[Dict[str, str]],
1051
- ) -> Tuple[float, float]:
568
+ messages: list[dict[str, str]],
569
+ ) -> tuple[float, float]:
1052
570
  primary_fitness_score: float = self._evaluate_prompt(
1053
571
  prompt,
1054
572
  messages, # type: ignore
@@ -1065,8 +583,8 @@ Return only the new prompt list object.
1065
583
  else:
1066
584
  # Single-objective
1067
585
  def _deap_evaluate_individual_fitness(
1068
- messages: List[Dict[str, str]],
1069
- ) -> Tuple[float, float]:
586
+ messages: list[dict[str, str]],
587
+ ) -> tuple[float, float]:
1070
588
  fitness_score: float = self._evaluate_prompt(
1071
589
  prompt,
1072
590
  messages, # type: ignore
@@ -1123,7 +641,7 @@ Return only the new prompt list object.
1123
641
  self.output_style_guidance = self.DEFAULT_OUTPUT_STYLE_GUIDANCE
1124
642
 
1125
643
  # Step 4. Initialize population
1126
- initial_prompts: List[chat_prompt.ChatPrompt] = self._initialize_population(
644
+ initial_prompts: list[chat_prompt.ChatPrompt] = self._initialize_population(
1127
645
  prompt=prompt
1128
646
  )
1129
647
 
@@ -1143,7 +661,7 @@ Return only the new prompt list object.
1143
661
  with reporting.evaluate_initial_population(
1144
662
  verbose=self.verbose
1145
663
  ) as report_initial_population:
1146
- fitnesses: List[Any] = list(map(self.toolbox.evaluate, deap_population))
664
+ fitnesses: list[Any] = list(map(self.toolbox.evaluate, deap_population))
1147
665
  _best_score = max(
1148
666
  best_primary_score_overall, max([x[0] for x in fitnesses])
1149
667
  )
@@ -1302,7 +820,7 @@ Return only the new prompt list object.
1302
820
  hof, key=lambda ind: ind.fitness.values[0], reverse=True
1303
821
  )
1304
822
  for i, sol in enumerate(sorted_hof):
1305
- final_results_log += f" Solution {i+1}: Primary Score={sol.fitness.values[0]:.4f}, Length={sol.fitness.values[1]:.0f}, Prompt='{str(sol)[:100]}...'\n"
823
+ final_results_log += f" Solution {i + 1}: Primary Score={sol.fitness.values[0]:.4f}, Length={sol.fitness.values[1]:.0f}, Prompt='{str(sol)[:100]}...'\n"
1306
824
  best_overall_solution = sorted_hof[0]
1307
825
  final_best_prompt = chat_prompt.ChatPrompt(
1308
826
  messages=best_overall_solution
@@ -1419,6 +937,7 @@ Return only the new prompt list object.
1419
937
  best_score=final_primary_score,
1420
938
  best_prompt=final_best_prompt.get_messages(),
1421
939
  verbose=self.verbose,
940
+ tools=getattr(final_best_prompt, "tools", None),
1422
941
  )
1423
942
  return OptimizationResult(
1424
943
  optimizer=self.__class__.__name__,
@@ -1434,353 +953,17 @@ Return only the new prompt list object.
1434
953
  optimization_id=self._current_optimization_id,
1435
954
  )
1436
955
 
1437
- @_throttle.rate_limited(_rate_limiter)
1438
- def _call_model(
1439
- self,
1440
- messages: List[Dict[str, str]],
1441
- is_reasoning: bool = False,
1442
- optimization_id: Optional[str] = None,
1443
- ) -> str:
1444
- """Call the model with the given prompt and return the response."""
1445
- try:
1446
- # Basic LLM parameters
1447
- llm_config_params = {
1448
- "temperature": getattr(self, "temperature", 0.3),
1449
- "max_tokens": getattr(self, "max_tokens", 1000),
1450
- "top_p": getattr(self, "top_p", 1.0),
1451
- "frequency_penalty": getattr(self, "frequency_penalty", 0.0),
1452
- "presence_penalty": getattr(self, "presence_penalty", 0.0),
1453
- }
1454
-
1455
- # Prepare metadata for opik
1456
- metadata_for_opik: Dict[str, Any] = {}
1457
- if self.project_name:
1458
- metadata_for_opik["project_name"] = self.project_name
1459
- metadata_for_opik["opik"] = {"project_name": self.project_name}
1460
-
1461
- if optimization_id:
1462
- if "opik" in metadata_for_opik:
1463
- metadata_for_opik["opik"]["optimization_id"] = optimization_id
1464
-
1465
- metadata_for_opik["optimizer_name"] = self.__class__.__name__
1466
- metadata_for_opik["opik_call_type"] = (
1467
- "reasoning" if is_reasoning else "evaluation_llm_task_direct"
1468
- )
1469
-
1470
- if metadata_for_opik:
1471
- llm_config_params["metadata"] = metadata_for_opik
1472
-
1473
- # Pass llm_config_params to the Opik monitor
1474
- final_call_params = opik_litellm_monitor.try_add_opik_monitoring_to_params(
1475
- llm_config_params.copy()
1476
- )
1477
-
1478
- logger.debug(
1479
- f"Calling model '{self.model}' with messages: {messages}, "
1480
- f"final params for litellm (from monitor): {final_call_params}"
1481
- )
1482
-
1483
- response = litellm.completion(
1484
- model=self.model, messages=messages, **final_call_params
1485
- )
1486
- self.llm_call_counter += 1
1487
-
1488
- logger.debug(f"Response: {response}")
1489
- return response.choices[0].message.content
1490
- except litellm_exceptions.RateLimitError as e:
1491
- logger.error(f"LiteLLM Rate Limit Error: {e}")
1492
- raise
1493
- except litellm_exceptions.APIConnectionError as e:
1494
- logger.error(f"LiteLLM API Connection Error: {e}")
1495
- raise
1496
- except litellm_exceptions.ContextWindowExceededError as e:
1497
- logger.error(f"LiteLLM Context Window Exceeded Error: {e}")
1498
- raise
1499
- except Exception as e:
1500
- logger.error(
1501
- f"Error calling model '{self.model}': {type(e).__name__} - {e}"
1502
- )
1503
- raise
1504
-
1505
- def _evaluate_prompt(
1506
- self,
1507
- prompt: chat_prompt.ChatPrompt,
1508
- messages: List[Dict[str, str]],
1509
- dataset: opik.Dataset,
1510
- metric: Callable,
1511
- n_samples: Optional[int] = None,
1512
- dataset_item_ids: Optional[List[str]] = None,
1513
- experiment_config: Optional[Dict] = None,
1514
- optimization_id: Optional[str] = None,
1515
- verbose: int = 0,
1516
- **kwargs: Any,
1517
- ) -> float:
1518
- """
1519
- Evaluate a single prompt (individual) against the dataset.
1520
-
1521
- Args:
1522
- prompt:
1523
- dataset: The dataset to use for evaluation
1524
- metric: Metric function to evaluate on, should have the arguments `dataset_item` and `llm_output`
1525
- n_samples: Optional number of samples to use
1526
- dataset_item_ids: Optional list of dataset item IDs to use
1527
- experiment_config: Optional experiment configuration
1528
- optimization_id: Optional optimization ID
1529
- verbose: Controls internal logging/progress bars (0=off, 1=on).
1530
-
1531
- Returns:
1532
- float: The metric value
1533
- """
1534
- total_items = len(dataset.get_items())
1535
-
1536
- new_prompt = prompt.copy()
1537
- new_prompt.set_messages(messages)
1538
-
1539
- experiment_config = experiment_config or {}
1540
- experiment_config["project_name"] = self.agent_class.project_name
1541
- experiment_config = {
1542
- **experiment_config,
1543
- "optimizer": self.__class__.__name__,
1544
- "agent_class": self.agent_class.__name__,
1545
- "agent_config": new_prompt.to_dict(),
1546
- "metric": metric.__name__,
1547
- "dataset": dataset.name,
1548
- "configuration": {
1549
- "prompt": new_prompt.get_messages(),
1550
- "n_samples_for_eval": (
1551
- len(dataset_item_ids) if dataset_item_ids is not None else n_samples
1552
- ),
1553
- "total_dataset_items": total_items,
1554
- },
1555
- }
1556
- try:
1557
- agent = self.agent_class(new_prompt)
1558
- except Exception:
1559
- return 0.0
1560
-
1561
- def llm_task(dataset_item: Dict[str, Any]) -> Dict[str, str]:
1562
- # print("MESSAGES:", new_prompt.messages)
1563
- messages = new_prompt.get_messages(dataset_item)
1564
- model_output = agent.invoke(messages)
1565
- # print("OUTPUT:", model_output)
1566
- return {mappers.EVALUATED_LLM_TASK_OUTPUT: model_output}
1567
-
1568
- # Evaluate the prompt
1569
- score = task_evaluator.evaluate(
1570
- dataset=dataset,
1571
- dataset_item_ids=dataset_item_ids,
1572
- metric=metric,
1573
- evaluated_task=llm_task,
1574
- num_threads=self.num_threads,
1575
- project_name=experiment_config["project_name"],
1576
- n_samples=n_samples if dataset_item_ids is None else None,
1577
- experiment_config=experiment_config,
1578
- optimization_id=optimization_id,
1579
- verbose=verbose,
1580
- )
1581
- return score
1582
-
1583
- def _llm_deap_crossover(self, ind1: Any, ind2: Any) -> Tuple[Any, Any]:
1584
- """Perform crossover by asking an LLM to blend two parent prompts."""
1585
- reporting.display_message(
1586
- " Recombining prompts using an LLM.", verbose=self.verbose
1587
- )
1588
-
1589
- parent1_messages: List[Dict[str, str]] = ind1
1590
- parent2_messages: List[Dict[str, str]] = ind2
1591
- current_output_style_guidance = self.output_style_guidance
956
+ # Evaluation is provided by EvaluationOps
1592
957
 
1593
- user_prompt_for_llm_crossover = f"""Parent Prompt 1:
1594
- '''{parent1_messages}'''
1595
-
1596
- Parent Prompt 2:
1597
- '''{parent2_messages}'''
1598
-
1599
- Desired output style from target LLM for children prompts: '{current_output_style_guidance}'
1600
-
1601
- Please generate TWO child prompts by intelligently blending the ideas, styles, or structures from these two parents, ensuring the children aim to elicit the desired output style.
1602
- Follow the instructions provided in the system prompt regarding the JSON output format:
1603
- [
1604
- [{{"role": "<role>", "content": "<content>"}}, {{"role": "<role>", "content": "<content>"}}], #child_1
1605
- [{{"role": "<role>", "content": "<content>"}}, {{"role": "<role>", "content": "<content>"}}], #child_2
1606
- ]
1607
- """
1608
- try:
1609
- logger.debug(
1610
- f"Attempting LLM-driven crossover between: '{parent1_messages[:50]}...' and '{parent2_messages[:50]}...' aiming for style: '{current_output_style_guidance[:30]}...'"
1611
- )
1612
- response_content = self._call_model(
1613
- messages=[
1614
- {
1615
- "role": "system",
1616
- "content": self.get_llm_crossover_system_prompt(),
1617
- },
1618
- {"role": "user", "content": user_prompt_for_llm_crossover},
1619
- ],
1620
- is_reasoning=True,
1621
- )
1622
- logger.debug(f"Raw LLM response for crossover: {response_content}")
1623
-
1624
- json_response = utils.json_to_dict(response_content)
1625
- if (
1626
- not isinstance(json_response, list)
1627
- or len(json_response) != 2
1628
- or not all(isinstance(cs, list) for cs in json_response)
1629
- ):
1630
- logger.warning(
1631
- "LLM Crossover: Malformed or empty children_prompts list. Falling back."
1632
- )
1633
- raise ValueError("Malformed LLM crossover response")
1634
-
1635
- child1: List[Dict[str, str]] = json_response[0]
1636
- child2: List[Dict[str, str]] = json_response[1]
1637
-
1638
- logger.debug(
1639
- f"LLM Crossover generated child1: {json.dumps(child1)[:50]}... Child2: {json.dumps(child2)[:50]}..."
1640
- )
1641
- return creator.Individual(child1), creator.Individual(child2)
1642
-
1643
- except Exception as e:
1644
- logger.warning(
1645
- f"LLM-driven crossover failed: {e}. Falling back to standard crossover."
1646
- )
1647
- return self._deap_crossover(ind1, ind2)
1648
-
1649
- def _get_task_description_for_llm(self, prompt: chat_prompt.ChatPrompt) -> str:
1650
- """Generates a concise task description for use in LLM prompts for fresh generation or radical innovation."""
1651
- description = "Task: Given a list of AI messages with placeholder values, generate an effective prompt. "
1652
- description += f"The original high-level instruction being optimized is: '{prompt.get_messages()}'. "
1653
- description += "The goal is to create an effective prompt that guides a language model to perform this task well."
1654
- return description
958
+ # LLM crossover is provided by CrossoverOps
959
+ # Helper provided by Helpers
1655
960
 
961
+ # Override prompt builders to centralize strings in prompts.py
1656
962
  def _get_reasoning_system_prompt_for_variation(self) -> str:
1657
- return f"""You are an expert prompt engineer specializing in creating diverse and effective prompts. Given an initial prompt, your task is to generate a diverse set of alternative prompts.
1658
-
1659
- For each prompt variation, consider:
1660
- 1. Different levels of specificity and detail, including significantly more detailed and longer versions.
1661
- 2. Various ways to structure the instruction, exploring more complex sentence structures and phrasings.
1662
- 3. Alternative phrasings that maintain the core intent but vary in style and complexity.
1663
- 4. Different emphasis on key components, potentially elaborating on them.
1664
- 5. Various ways to express constraints or requirements.
1665
- 6. Different approaches to clarity and conciseness, but also explore more verbose and explanatory styles.
1666
- 7. Alternative ways to guide the model's response format.
1667
- 8. Consider variations that are substantially longer and more descriptive than the original.
1668
-
1669
- The generated prompts should guide a target LLM to produce outputs in the following style: '{self.output_style_guidance}'
1670
-
1671
- Return a JSON array of prompts with the following structure:
1672
- {{
1673
- "prompts": [
1674
- {{
1675
- "prompt": "alternative prompt 1",
1676
- "strategy": "brief description of the variation strategy used, e.g., 'focused on eliciting specific output style'"
1677
- }},
1678
- {{
1679
- "prompt": "alternative prompt 2",
1680
- "strategy": "brief description of the variation strategy used"
1681
- }}
1682
- ]
1683
- }}
1684
- Each prompt variation should aim to get the target LLM to produce answers matching the desired style: '{self.output_style_guidance}'.
1685
- """
963
+ return evo_prompts.variation_system_prompt(self.output_style_guidance)
1686
964
 
1687
965
  def get_llm_crossover_system_prompt(self) -> str:
1688
- return f"""You are an expert prompt engineer specializing in creating novel prompts by intelligently blending existing ones.
1689
- Given two parent prompts, your task is to generate one or two new child prompts that effectively combine the strengths, styles, or core ideas of both parents.
1690
- The children should be coherent and aim to explore a potentially more effective region of the prompt design space, with a key goal of eliciting responses from the target language model in the following style: '{self.output_style_guidance}'.
1691
-
1692
- Consider the following when generating children:
1693
- - Identify the key instructions, constraints, and desired output formats in each parent, paying attention to any hints about desired output style.
1694
- - Explore ways to merge these elements such that the resulting prompt strongly guides the target LLM towards the desired output style.
1695
- - You can create a child that is a direct blend, or one that takes a primary structure from one parent and incorporates specific elements from the other, always optimizing for clear instruction towards the desired output style.
1696
- - If generating two children, try to make them distinct from each other and from the parents, perhaps by emphasizing different aspects of the parental combination that could lead to the desired output style.
1697
-
1698
- All generated prompts must aim for eliciting answers in the style: '{self.output_style_guidance}'.
1699
-
1700
- Return a JSON object that is a list of both child prompts. Each child prompt is a list of LLM messages. Example:
1701
- [
1702
- [{{"role": "<role>", "content": "<content>"}},{{"role": "<role>", "content": "<content>"}}],
1703
- [{{"role": "<role>", "content": "<content>"}},{{"role": "<role>", "content": "<content>"}}]
1704
- ]
1705
-
1706
-
1707
- """
966
+ return evo_prompts.llm_crossover_system_prompt(self.output_style_guidance)
1708
967
 
1709
968
  def _get_radical_innovation_system_prompt(self) -> str:
1710
- return f"""You are an expert prompt engineer and a creative problem solver.
1711
- Given a task description and an existing prompt for that task (which might be underperforming), your goal is to generate a new, significantly improved, and potentially very different prompt.
1712
- Do not just make minor edits. Think about alternative approaches, structures, and phrasings that could lead to better performance.
1713
- Consider clarity, specificity, constraints, and how to best guide the language model for the described task TO PRODUCE OUTPUTS IN THE FOLLOWING STYLE: '{self.output_style_guidance}'.
1714
- Return only the new prompt string, with no preamble or explanation.
1715
- """
1716
-
1717
- def _infer_output_style_from_dataset(
1718
- self, dataset: opik.Dataset, prompt: chat_prompt.ChatPrompt, n_examples: int = 5
1719
- ) -> Optional[str]:
1720
- """Analyzes dataset examples to infer the desired output style."""
1721
- with reporting.infer_output_style(
1722
- verbose=self.verbose
1723
- ) as report_infer_output_style:
1724
- report_infer_output_style.start_style_inference()
1725
-
1726
- try:
1727
- items_to_process = dataset.get_items(n_examples)
1728
- except Exception as e:
1729
- report_infer_output_style.error(
1730
- f"Failed to get items from dataset '{dataset.name}': {e}"
1731
- )
1732
- return None
1733
-
1734
- if not items_to_process:
1735
- report_infer_output_style.error(
1736
- f"Dataset '{dataset.name}' is empty. Cannot infer output style."
1737
- )
1738
- return None
1739
-
1740
- # Need at least a couple of examples for meaningful inference
1741
- if len(items_to_process) < min(n_examples, 2):
1742
- report_infer_output_style.error(
1743
- f"Not enough dataset items (found {len(items_to_process)}) to reliably infer output style. Need at least {min(n_examples,2)}."
1744
- )
1745
- return None
1746
-
1747
- examples_str = ""
1748
- for i, item_content in enumerate(items_to_process):
1749
- filtered_content = {x: y for x, y in item_content.items() if x != "id"}
1750
- examples_str += (
1751
- f"Example {i+1}:\nDataset Item:\n{filtered_content}\n---\n"
1752
- )
1753
-
1754
- user_prompt_for_style_inference = f"""Please analyze the following examples from a dataset and provide a concise, actionable description of the REQUIRED output style for the target LLM. Before describing the output style, make sure to understand the dataset content and structure as it can include input, output and metadata fields. This description will be used to guide other LLMs in generating and refining prompts.
1755
-
1756
- {examples_str}
1757
-
1758
- Based on these examples, what is the desired output style description?
1759
- Remember to focus on aspects like length, tone, structure, content details, and any recurring keywords or phrasing patterns in the outputs.
1760
- The description should be a single string that can be directly used as an instruction for another LLM.
1761
- Return ONLY this descriptive string.
1762
- """
1763
- # report_infer_output_style.display_style_inference_prompt(user_prompt_for_style_inference)
1764
-
1765
- try:
1766
- inferred_style = self._call_model(
1767
- messages=[
1768
- {"role": "system", "content": self._INFER_STYLE_SYSTEM_PROMPT},
1769
- {"role": "user", "content": user_prompt_for_style_inference},
1770
- ],
1771
- is_reasoning=True,
1772
- )
1773
- inferred_style = inferred_style.strip()
1774
- if inferred_style:
1775
- report_infer_output_style.success(inferred_style)
1776
- return inferred_style
1777
- else:
1778
- report_infer_output_style.error(
1779
- "LLM returned empty string for inferred output style."
1780
- )
1781
- return None
1782
- except Exception as e:
1783
- report_infer_output_style.error(
1784
- f"Error during output style inference: {e}"
1785
- )
1786
- return None
969
+ return evo_prompts.radical_innovation_system_prompt(self.output_style_guidance)
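The three prompt builders above now delegate to the new evolutionary_optimizer/prompts.py (file 16 in the list), which is not shown in this hunk. The call sites imply simple string-returning functions keyed on the output style guidance, roughly like the following condensed sketch (the released file carries the full prompt texts that were removed above):

# opik_optimizer/evolutionary_optimizer/prompts.py -- condensed, illustrative sketch
def variation_system_prompt(output_style_guidance: str) -> str:
    return (
        "You are an expert prompt engineer generating diverse prompt variations. "
        f"All variations must elicit answers in the style: '{output_style_guidance}'."
    )

def llm_crossover_system_prompt(output_style_guidance: str) -> str:
    return (
        "Blend two parent prompts into one or two child prompts that keep "
        f"eliciting answers in the style: '{output_style_guidance}'."
    )

def radical_innovation_system_prompt(output_style_guidance: str) -> str:
    return (
        "Given a task description and an underperforming prompt, produce a significantly "
        f"improved prompt whose outputs match the style: '{output_style_guidance}'."
    )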