opik-optimizer 0.8.0__py3-none-any.whl → 0.9.0rc0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. opik_optimizer/__init__.py +15 -26
  2. opik_optimizer/base_optimizer.py +28 -44
  3. opik_optimizer/data/hotpot-500.json +501 -1001
  4. opik_optimizer/datasets/__init__.py +6 -7
  5. opik_optimizer/datasets/hotpot_qa.py +2 -1
  6. opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +742 -726
  7. opik_optimizer/evolutionary_optimizer/reporting.py +246 -0
  8. opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +296 -194
  9. opik_optimizer/few_shot_bayesian_optimizer/reporting.py +119 -0
  10. opik_optimizer/meta_prompt_optimizer/__init__.py +5 -0
  11. opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +816 -0
  12. opik_optimizer/meta_prompt_optimizer/reporting.py +140 -0
  13. opik_optimizer/mipro_optimizer/__init__.py +1 -1
  14. opik_optimizer/mipro_optimizer/_mipro_optimizer_v2.py +12 -20
  15. opik_optimizer/mipro_optimizer/mipro_optimizer.py +32 -52
  16. opik_optimizer/mipro_optimizer/utils.py +1 -23
  17. opik_optimizer/optimization_config/chat_prompt.py +106 -0
  18. opik_optimizer/optimization_config/configs.py +2 -21
  19. opik_optimizer/optimization_config/mappers.py +1 -1
  20. opik_optimizer/optimization_result.py +57 -85
  21. opik_optimizer/reporting_utils.py +180 -0
  22. opik_optimizer/task_evaluator.py +33 -25
  23. opik_optimizer/utils.py +187 -3
  24. {opik_optimizer-0.8.0.dist-info → opik_optimizer-0.9.0rc0.dist-info}/METADATA +15 -31
  25. opik_optimizer-0.9.0rc0.dist-info/RECORD +48 -0
  26. {opik_optimizer-0.8.0.dist-info → opik_optimizer-0.9.0rc0.dist-info}/WHEEL +1 -1
  27. opik_optimizer/few_shot_bayesian_optimizer/prompt_parameter.py +0 -91
  28. opik_optimizer/few_shot_bayesian_optimizer/prompt_templates.py +0 -80
  29. opik_optimizer/integrations/__init__.py +0 -0
  30. opik_optimizer/meta_prompt_optimizer.py +0 -1151
  31. opik_optimizer-0.8.0.dist-info/RECORD +0 -45
  32. {opik_optimizer-0.8.0.dist-info → opik_optimizer-0.9.0rc0.dist-info}/licenses/LICENSE +0 -0
  33. {opik_optimizer-0.8.0.dist-info → opik_optimizer-0.9.0rc0.dist-info}/top_level.txt +0 -0
opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py (new file)
@@ -0,0 +1,816 @@
+ import json
+ import logging
+ import os
+ from typing import Any, Callable, Dict, List, Optional, overload
+
+ import litellm
+ import opik
+ from litellm.caching import Cache
+ from litellm.types.caching import LiteLLMCacheType
+ from opik import Dataset
+ from opik.api_objects import opik_client
+ from opik.environment import get_tqdm_for_current_environment
+ from opik.evaluation.models.litellm import opik_monitor as opik_litellm_monitor
+
+ from opik_optimizer import task_evaluator
+
+ from .. import _throttle
+ from ..base_optimizer import BaseOptimizer, OptimizationRound
+ from ..optimization_config import chat_prompt, mappers
+ from ..optimization_result import OptimizationResult
+ from . import reporting
+
+ tqdm = get_tqdm_for_current_environment()
+
+ # Using disk cache for LLM calls
+ disk_cache_dir = os.path.expanduser("~/.litellm_cache")
+ litellm.cache = Cache(type=LiteLLMCacheType.DISK, disk_cache_dir=disk_cache_dir)
+
+ # Set up logging
+ logger = logging.getLogger(__name__)  # Gets logger configured by setup_logging
+
+ _rate_limiter = _throttle.get_rate_limiter_for_current_opik_installation()
+
+
+ class MetaPromptOptimizer(BaseOptimizer):
+     """
+     The Meta-Prompt Optimizer uses meta-prompting to improve prompts based on examples and performance.
+
+     This algorithm is best used when you have a prompt and would like to make sure it follows best
+     practices.
+     """
+     # --- Constants for Default Configuration ---
+     DEFAULT_ROUNDS = 3
+     DEFAULT_PROMPTS_PER_ROUND = 4
+
+     # --- Reasoning System Prompt ---
+     _REASONING_SYSTEM_PROMPT = """You are an expert prompt engineer. Your task is to improve prompts for any type of task.
+
+ Focus on making the prompt more effective by:
+ 1. Being clear and specific about what is expected
+ 2. Providing necessary context and constraints
+ 3. Guiding the model to produce the desired output format
+ 4. Removing ambiguity and unnecessary elements
+ 5. Maintaining conciseness while being complete
+
+ Instructions:
+ 1. If there is a system prompt, prioritize adding instructions there if and only if it makes sense.
+ 2. DO NOT add any variables or parameters to the prompt you are editing.
+ 3. You can reuse variables that already exist in the prompt.
+
+ Return a JSON array of prompts with the following structure. Make sure to return a valid
+ JSON object with correct use of double quotes and single quotes. JSON keys should be
+ double-quoted:
+ {
+     "prompts": [
+         {
+             "prompt": [{"role": "<role>", "content": "<content>"}],
+             "improvement_focus": "what aspect this prompt improves",
+             "reasoning": "why this improvement should help"
+         },
+         {
+             "prompt": [{"role": "<role>", "content": "<content>"}],
+             "improvement_focus": "what aspect this prompt improves",
+             "reasoning": "why this improvement should help"
+         }
+     ]
+ }"""
+
+     def __init__(
+         self,
+         model: str,
+         reasoning_model: Optional[str] = None,
+         rounds: int = DEFAULT_ROUNDS,
+         num_prompts_per_round: int = DEFAULT_PROMPTS_PER_ROUND,
+         num_threads: int = 12,
+         project_name: str = "Optimization",
+         verbose: int = 1,
+         enable_context: bool = True,
+         **model_kwargs,
+     ):
+         """
+         Args:
+             model: The model to use for evaluation
+             reasoning_model: The model to use for reasoning and prompt generation
+             rounds: Number of optimization rounds
+             num_prompts_per_round: Number of prompts to generate per round
+             num_threads: Number of threads for parallel evaluation
+             project_name: Optional project name for tracking
+             verbose: Controls internal logging/progress bars (0=off, 1=on).
+             enable_context: Whether to include task-specific context (metrics, examples) in the reasoning prompt.
+             **model_kwargs: Additional model parameters
+         """
+         super().__init__(model=model, project_name=project_name, **model_kwargs)
+         self.reasoning_model = reasoning_model if reasoning_model is not None else model
+         self.rounds = rounds
+         self.num_prompts_per_round = num_prompts_per_round
+         self.num_threads = num_threads
+         self.verbose = verbose
+         self.dataset = None
+         self._opik_client = opik_client.get_client_cached()
+         self.llm_call_counter = 0
+         self.enable_context = enable_context
+         logger.debug(
+             f"Initialized MetaPromptOptimizer with model={model}, reasoning_model={self.reasoning_model}"
+         )
+         logger.debug(
+             f"Optimization rounds: {rounds}, Prompts/round: {num_prompts_per_round}"
+         )
+
+     @_throttle.rate_limited(_rate_limiter)
+     def _call_model(
+         self,
+         messages: List[Dict[str, str]],
+         is_reasoning: bool = False,
+         optimization_id: Optional[str] = None,
+     ) -> str:
+         """Call the model with the given prompt and return the response."""
+         self.llm_call_counter += 1
+         # Note: Basic retry logic could be added here using tenacity
+         try:
+             # Basic LLM parameters (e.g., temperature, max_tokens)
+             base_temperature = getattr(self, "temperature", 0.3)
+             base_max_tokens = getattr(self, "max_tokens", 1000)
+
+             # Use potentially different settings for reasoning calls
+             reasoning_temperature = base_temperature  # Keep same temp unless specified otherwise
+             # Increase max_tokens for reasoning to ensure JSON fits, unless already high
+             reasoning_max_tokens = max(base_max_tokens, 3000) if is_reasoning else base_max_tokens
+
+             llm_config_params = {
+                 "temperature": reasoning_temperature if is_reasoning else base_temperature,
+                 "max_tokens": reasoning_max_tokens,
+                 "top_p": getattr(self, "top_p", 1.0),
+                 "frequency_penalty": getattr(self, "frequency_penalty", 0.0),
+                 "presence_penalty": getattr(self, "presence_penalty", 0.0),
+             }
+
+             # Prepare metadata that we want to be part of the LLM call context.
+             metadata_for_opik = {}
+             if self.project_name:
+                 metadata_for_opik["project_name"] = (
+                     self.project_name
+                 )  # Top-level for general use
+                 metadata_for_opik["opik"] = {"project_name": self.project_name}
+
+             if optimization_id:
+                 # Also add to opik-specific structure if project_name was added
+                 if "opik" in metadata_for_opik:
+                     metadata_for_opik["opik"]["optimization_id"] = optimization_id
+
+             metadata_for_opik["optimizer_name"] = self.__class__.__name__
+             metadata_for_opik["opik_call_type"] = (
+                 "reasoning" if is_reasoning else "evaluation_llm_task_direct"
+             )
+
+             if metadata_for_opik:
+                 llm_config_params["metadata"] = metadata_for_opik
+
+             model_to_use = self.reasoning_model if is_reasoning else self.model
+
+             # Pass llm_config_params (which now includes our metadata) to the Opik monitor.
+             # The monitor is expected to return a dictionary suitable for spreading into litellm.completion,
+             # having handled our metadata and added any Opik-specific configurations.
+             final_call_params = opik_litellm_monitor.try_add_opik_monitoring_to_params(
+                 llm_config_params.copy()
+             )
+
+             logger.debug(
+                 f"Calling model '{model_to_use}' with messages: {messages}, "
+                 f"final params for litellm (from monitor): {final_call_params}"
+             )
+
+             response = litellm.completion(
+                 model=model_to_use,
+                 messages=messages,
+                 num_retries=6,
+                 **final_call_params
+             )
+             return response.choices[0].message.content
+         except litellm.exceptions.RateLimitError as e:
+             logger.error(f"LiteLLM Rate Limit Error: {e}")
+             raise
+         except litellm.exceptions.APIConnectionError as e:
+             logger.error(f"LiteLLM API Connection Error: {e}")
+             raise
+         except litellm.exceptions.ContextWindowExceededError as e:
+             logger.error(f"LiteLLM Context Window Exceeded Error: {e}")
+             # Log prompt length if possible? Needs access to prompt_for_llm here.
+             raise
+         except Exception as e:
+             logger.error(
+                 f"Error calling model '{model_to_use}': {type(e).__name__} - {e}"
+             )
+             raise
+
+     # type: ignore
+     def evaluate_prompt(
+         self,
+         prompt: chat_prompt.ChatPrompt,
+         dataset: opik.Dataset,
+         metric: Callable,
+         use_full_dataset: bool = True,
+         experiment_config: Optional[Dict] = None,
+         n_samples: Optional[int] = None,
+         optimization_id: Optional[str] = None,
+         verbose: int = 1,
+     ) -> float:
+         """
+         Args:
+             prompt: The prompt to evaluate
+             dataset: Opik Dataset to evaluate the prompt on
+             metric: Metric function used to score the LLM output
+             use_full_dataset: Whether to use the full dataset or a subset
+             experiment_config: Optional configuration for the experiment, useful to log additional metadata
+             n_samples: Optional number of items to test in the dataset
+             optimization_id: Optional ID of the optimization
+             verbose: Controls internal logging/progress bars (0=off, 1=on).
+         Returns:
+             float: The evaluation score
+         """
+         # Calculate subset size for trials
+         if not use_full_dataset:
+             total_items = len(dataset.get_items())
+             if n_samples is not None:
+                 if n_samples > total_items:
+                     logger.warning(
+                         f"Requested n_samples ({n_samples}) is larger than dataset size ({total_items}). Using full dataset."
+                     )
+                     subset_size = None
+                 else:
+                     subset_size = n_samples
+                     logger.debug(f"Using specified n_samples: {subset_size} items")
+             else:
+                 # Use about 20% of the dataset, clamped to between 10 and 20 items and never more than the total
+                 subset_size = min(total_items, min(20, max(10, int(total_items * 0.2))))
+                 logger.debug(
+                     f"Using automatic subset size calculation: {subset_size} items (20% of {total_items} total items)"
+                 )
+         else:
+             subset_size = None  # Use all items for final checks
+             logger.debug("Using full dataset for evaluation")
+
+         experiment_config = experiment_config or {}
+         experiment_config = {
+             **experiment_config,
+             **{
+                 "optimizer": self.__class__.__name__,
+                 "metric": metric.__name__,
+                 "dataset": dataset.name,
+                 "configuration": {
+                     "prompt": prompt.formatted_messages,
+                     "n_samples": subset_size,
+                     "use_full_dataset": use_full_dataset,
+                 },
+             },
+         }
+         if optimization_id:
+             experiment_config["optimization_id"] = optimization_id
+
+         def llm_task(dataset_item: Dict[str, Any]) -> Dict[str, str]:
+             # --- Step 1: Prepare the prompt for the LLM ---
+             messages = [{
+                 "role": item["role"],
+                 "content": item["content"].format(**dataset_item)
+             } for item in prompt.formatted_messages]
+
+             # --- Step 2: Call the model ---
+             try:
+                 logger.debug(f"Calling LLM with prompt length: {sum(len(msg['content']) for msg in messages)}")
+                 raw_model_output = self._call_model(
+                     messages=messages,
+                     is_reasoning=False,
+                     optimization_id=optimization_id,
+                 )
+                 logger.debug(f"LLM raw response length: {len(raw_model_output)}")
+                 logger.debug(f"LLM raw output: {raw_model_output}")
+             except Exception as e:
+                 logger.error(f"Error calling model with prompt: {e}")
+                 logger.error(f"Failed prompt: {messages}")
+                 logger.error(f"Prompt length: {sum(len(msg['content']) for msg in messages)}")
+                 raise
+
+             # --- Step 3: Clean the model's output before metric evaluation ---
+             cleaned_model_output = raw_model_output.strip()
+
+             result = {
+                 mappers.EVALUATED_LLM_TASK_OUTPUT: cleaned_model_output,
+             }
+             return result
+
+         # Use dataset's get_items with limit for sampling
+         logger.debug(
+             f"Starting evaluation with {subset_size if subset_size else 'all'} samples for metric: {metric.__name__}"
+         )
+         score = task_evaluator.evaluate(
+             dataset=dataset,
+             metric=metric,
+             evaluated_task=llm_task,
+             num_threads=self.num_threads,
+             project_name=self.project_name,
+             n_samples=subset_size,  # Use subset_size for trials, None for full dataset
+             experiment_config=experiment_config,
+             optimization_id=optimization_id,
+             verbose=self.verbose,
+         )
+         logger.debug(f"Evaluation score: {score:.4f}")
+         return score
+
+     def optimize_prompt(  # type: ignore[override]
+         self,
+         prompt: chat_prompt.ChatPrompt,
+         dataset: Dataset,
+         metric: Callable,
+         experiment_config: Optional[Dict] = None,
+         n_samples: Optional[int] = None,
+         auto_continue: bool = False,
+         **kwargs,
+     ) -> OptimizationResult:
+         """
+         Optimize a prompt using meta-reasoning.
+
+         Args:
+             prompt: The prompt to optimize
+             dataset: The dataset to evaluate against
+             metric: The metric to use for evaluation
+             experiment_config: A dictionary to log with the experiments
+             n_samples: The number of dataset items to use for evaluation
+             auto_continue: If True, the algorithm may continue if the goal is not met
+             **kwargs: Additional arguments for evaluation
+
+         Returns:
+             OptimizationResult: Structured result containing optimization details
+         """
+         reporting.display_header(self.__class__.__name__, verbose=self.verbose)
+
+         total_items = len(dataset.get_items())
+         if n_samples is not None and n_samples > total_items:
+             logger.warning(
+                 f"Requested n_samples ({n_samples}) is larger than dataset size ({total_items}). Using full dataset."
+             )
+             n_samples = None
+
+         reporting.display_configuration(
+             messages=prompt.formatted_messages,
+             optimizer_config={
+                 "optimizer": self.__class__.__name__,
+                 "n_samples": n_samples,
+                 "auto_continue": auto_continue
+             },
+             verbose=self.verbose
+         )
+
+         optimization = None
+         try:
+             optimization = self._opik_client.create_optimization(
+                 dataset_name=dataset.name,
+                 objective_name=metric.__name__,
+                 metadata={"optimizer": self.__class__.__name__},
+             )
+             logger.debug(f"Created optimization with ID: {optimization.id}")
+         except Exception as e:
+             logger.warning(
+                 f"Opik server does not support optimizations: {e}. Please upgrade opik."
+             )
+             optimization = None
+
+         try:
+             result = self._optimize_prompt(
+                 optimization_id=optimization.id if optimization is not None else None,
+                 prompt=prompt,
+                 dataset=dataset,
+                 metric=metric,
+                 experiment_config=experiment_config,
+                 n_samples=n_samples,
+                 auto_continue=auto_continue,
+                 **kwargs,
+             )
+             if optimization:
+                 self.update_optimization(optimization, status="completed")
+                 logger.debug("Optimization completed successfully")
+             return result
+         except Exception as e:
+             logger.error(f"Optimization failed: {e}")
+             if optimization:
+                 self.update_optimization(optimization, status="cancelled")
+                 logger.debug("Optimization marked as cancelled")
+             raise e
+
+     def _optimize_prompt(
+         self,
+         optimization_id: str,
+         prompt: chat_prompt.ChatPrompt,
+         dataset: Dataset,
+         metric: Callable,
+         experiment_config: Optional[Dict],
+         n_samples: int,
+         auto_continue: bool,
+         **kwargs,
+     ) -> OptimizationResult:
+         self.auto_continue = auto_continue
+         self.dataset = dataset
+         self.prompt = prompt
+         self.llm_call_counter = 0  # Reset counter for run
+
+         current_prompt = prompt.formatted_messages
+         experiment_config = experiment_config or {}
+         experiment_config = {
+             **experiment_config,
+             **{
+                 "optimizer": self.__class__.__name__,
+                 "metric": metric.__name__,
+                 "dataset": self.dataset.name,
+                 "configuration": {
+                     "prompt": current_prompt,
+                     "rounds": self.rounds,
+                     "num_prompts_per_round": self.num_prompts_per_round,
+                 },
+             },
+         }
+
+         with reporting.display_evaluation(verbose=self.verbose) as baseline_reporter:
+             initial_score = self.evaluate_prompt(
+                 prompt=prompt,
+                 optimization_id=optimization_id,
+                 dataset=dataset,
+                 metric=metric,
+                 n_samples=n_samples,
+                 experiment_config=experiment_config,
+                 use_full_dataset=n_samples is None,
+                 verbose=self.verbose,
+             )
+             best_score = initial_score
+             best_prompt = current_prompt
+             rounds = []
+
+             baseline_reporter.set_score(initial_score)
+
+         reporting.display_optimization_start_message(verbose=self.verbose)
+         with reporting.display_round_progress(self.rounds, verbose=self.verbose) as round_reporter:
+             for round_num in range(self.rounds):
+
+                 round_reporter.round_start(round_num)
+                 previous_best_score = best_score
+
+                 # Step 1. Create a set of candidate prompts
+                 try:
+                     candidate_prompts = self._generate_candidate_prompts(
+                         current_prompt=best_prompt,
+                         best_score=best_score,
+                         round_num=round_num,
+                         previous_rounds=rounds,
+                         metric=metric,
+                         optimization_id=optimization_id,
+                     )
+                 except Exception as e:
+                     round_reporter.failed_to_generate(self.num_prompts_per_round, e)
+                     continue
+
+                 # Step 2. Score each candidate prompt
+                 prompt_scores = []
+                 for candidate_count, candidate_prompt in enumerate(candidate_prompts):
+                     with reporting.display_prompt_candidate_scoring_report(candidate_count, candidate_prompt, verbose=self.verbose) as eval_report:
+                         eval_report.set_generated_prompts(candidate_count, candidate_prompt)
+
+                         try:
+                             prompt_score = self.evaluate_prompt(
+                                 prompt=chat_prompt.ChatPrompt(messages=candidate_prompt),
+                                 optimization_id=optimization_id,
+                                 dataset=dataset,
+                                 metric=metric,
+                                 n_samples=n_samples,
+                                 use_full_dataset=False,
+                                 experiment_config=experiment_config,
+                                 verbose=self.verbose,
+                             )
+
+                             eval_report.set_final_score(best_score, prompt_score)
+                         except Exception as e:
+                             raise ValueError(f"Error evaluating candidate prompt: {e}")
+
+                         prompt_scores.append((candidate_prompt, prompt_score))
+
+                 # Step 3. Identify potential improvements
+                 if not prompt_scores:
+                     logger.warning("No prompts were successfully evaluated in this round")
+                     break
+
+                 prompt_scores.sort(key=lambda x: x[1], reverse=True)
+                 best_candidate_this_round, best_cand_score_avg = (
+                     prompt_scores[0]
+                 )
+                 improvement = self._calculate_improvement(best_cand_score_avg, best_score)
+                 round_reporter.round_end(round_num, best_cand_score_avg, best_score, best_prompt)
+
+                 round_data = self._create_round_data(
+                     round_num=round_num,
+                     current_best_prompt=chat_prompt.ChatPrompt(messages=best_candidate_this_round),
+                     current_best_score=best_cand_score_avg,
+                     best_prompt_overall=chat_prompt.ChatPrompt(messages=best_prompt),
+                     evaluated_candidates=prompt_scores,
+                     previous_best_score=previous_best_score,
+                     improvement_this_round=improvement,
+                 )
+                 rounds.append(round_data)
+                 self._add_to_history(round_data.model_dump())
+
+                 if improvement > 0:
+                     best_score = best_cand_score_avg
+                     best_prompt = best_candidate_this_round
+
+         reporting.display_result(
+             initial_score,
+             best_score,
+             best_prompt,
+             verbose=self.verbose
+         )
+
+         return self._create_result(
+             metric,
+             prompt,
+             best_prompt,
+             best_score,
+             initial_score,
+             rounds,
+         )
+
+     def _calculate_improvement(
+         self, current_score: float, previous_score: float
+     ) -> float:
+         """Calculate the improvement percentage between scores."""
+         return (
+             (current_score - previous_score) / previous_score
+             if previous_score > 0
+             else 0
+         )
+
+     def _create_round_data(
+         self,
+         round_num: int,
+         current_best_prompt: chat_prompt.ChatPrompt,
+         current_best_score: float,
+         best_prompt_overall: chat_prompt.ChatPrompt,
+         evaluated_candidates: List[tuple[str, float, List[float]]],
+         previous_best_score: float,
+         improvement_this_round: float,
+     ) -> OptimizationRound:
+         """Create an OptimizationRound object with the current round's data."""
+         generated_prompts_log = []
+         for prompt, score in evaluated_candidates:
+             improvement_vs_prev = self._calculate_improvement(
+                 score, previous_best_score
+             )
+             generated_prompts_log.append(
+                 {
+                     "prompt": prompt,
+                     "score": score,
+                     "improvement": improvement_vs_prev,
+                 }
+             )
+
+         return OptimizationRound(
+             round_number=round_num + 1,
+             current_prompt=current_best_prompt,
+             current_score=current_best_score,
+             generated_prompts=generated_prompts_log,
+             best_prompt=best_prompt_overall,
+             best_score=current_best_score,
+             improvement=improvement_this_round,
+         )
+
+     def _create_result(
+         self,
+         metric: Callable,
+         prompt: chat_prompt.ChatPrompt,
+         best_prompt: str,
+         best_score: float,
+         initial_score: float,
+         rounds: List[OptimizationRound],
+     ) -> OptimizationResult:
+         """Create the final OptimizationResult object."""
+         details = {
+             "initial_prompt": prompt,
+             "initial_score": initial_score,
+             "final_prompt": best_prompt,
+             "final_score": best_score,
+             "rounds": rounds,
+             "total_rounds": len(rounds),
+             "metric_name": metric.__name__,
+             "model": self.model,
+             "temperature": self.model_kwargs.get("temperature"),
+         }
+
+         return OptimizationResult(
+             optimizer=self.__class__.__name__,
+             prompt=best_prompt,
+             score=best_score,
+             metric_name=metric.__name__,
+             details=details,
+             llm_calls=self.llm_call_counter
+         )
+
+     def _get_task_context(self, metric: Callable) -> str:
+         """Get task-specific context from the dataset and metric configuration."""
+         if self.dataset is None:
+             return ""
+
+         sample = None
+         try:
+             # Try get_items() first as it's the preferred method
+             items = self.dataset.get_items()
+             sample = items[0]  # Get first sample
+         except Exception as e:
+             logger.warning(f"Could not get sample from dataset: {e}")
+
+         if sample is None:
+             return ""
+
+         # Describe the single metric
+         metric_name = metric.__name__
+         description = metric.__doc__ or "No description available."
+
+         metrics_str = f"- {metric_name}: {description}"
+
+         context = "\nTask Context:\n"
+         context += f"Dataset fields (includes both input and optionally the expected output): {', '.join([x for x in sample.keys() if x != 'id'])}\n"
+         context += f"Evaluation Metric:\n{metrics_str}\n"
+         context += f"\nExample:\n{json.dumps(sample)}\n"
+
+         return context
+
+     def _generate_candidate_prompts(
+         self,
+         current_prompt: chat_prompt.ChatPrompt,
+         best_score: float,
+         round_num: int,
+         previous_rounds: List[OptimizationRound],
+         metric: Callable,
+         optimization_id: Optional[str] = None,
+     ) -> List[str]:
+         """Generate candidate prompts using meta-prompting."""
+         with reporting.display_candidate_generation_report(
+             self.num_prompts_per_round,
+             verbose=self.verbose
+         ) as candidate_generation_report:
+             logger.debug(f"\nGenerating candidate prompts for round {round_num + 1}")
+             logger.debug(f"Generating from prompt: {current_prompt}")
+             logger.debug(f"Current best score: {best_score:.4f}")
+
+             history_context = self._build_history_context(previous_rounds)
+             task_context_str = ""
+             analysis_instruction = ""
+             metric_focus_instruction = ""
+             improvement_point_1 = ""
+
+             if self.enable_context:
+                 task_context_str = self._get_task_context(metric=metric)
+                 analysis_instruction = "Analyze the example provided (if any), the metric description (if any), and the history of scores."
+                 metric_focus_instruction = f"Focus on improving the score for the metric: {metric.__name__}."
+                 improvement_point_1 = "1. Be more specific and clear about expectations based on the metric and task."
+                 logger.debug("Task context and metric-specific instructions enabled for reasoning prompt.")
+             else:
+                 analysis_instruction = "Analyze the history of scores and the current prompt's performance."
+                 metric_focus_instruction = "Focus on generating diverse and effective prompt variations based on the history."
+                 improvement_point_1 = "1. Be more specific and clear about expectations based on the task."
+                 logger.debug("Task context and metric-specific instructions disabled for reasoning prompt.")
+
+             user_prompt = f"""Current prompt: {current_prompt}
+ Current score: {best_score}
+ {history_context}
+ {task_context_str}
+
+ {analysis_instruction}
+ Generate {self.num_prompts_per_round} improved versions of this prompt.
+ {metric_focus_instruction}
+ Each version should aim to:
+ {improvement_point_1}
+ 2. Provide necessary context and constraints (if applicable, without relying on disabled external context).
+ 3. Guide the model to produce the desired output format suitable for the task.
+ 4. Remove ambiguity and unnecessary elements.
+ 5. Maintain conciseness while being complete.
+
+ Return a valid JSON array as specified."""
+
+             try:
+                 # Use _call_model which handles selecting reasoning_model
+                 content = self._call_model(
+                     messages=[
+                         {"role": "system", "content": self._REASONING_SYSTEM_PROMPT},
+                         {"role": "user", "content": user_prompt}
+                     ],
+                     is_reasoning=True,
+                     optimization_id=optimization_id,
+                 )
+                 logger.debug(f"Raw response from reasoning model: {content}")
+
+                 # --- Robust JSON Parsing and Validation ---
+                 json_result = None
+                 try:
+                     # Try direct JSON parsing
+                     json_result = json.loads(content)
+                 except json.JSONDecodeError:
+                     import re
+
+                     json_match = re.search(r"\{.*\}", content, re.DOTALL)
+                     if json_match:
+                         try:
+                             json_result = json.loads(json_match.group())
+                         except json.JSONDecodeError as e:
+                             raise ValueError(f"Could not parse JSON extracted via regex: {e} - received: {json_match.group()}")
+                     else:
+                         raise ValueError(f"No JSON object found in response via regex. - received: {content}")
+
+                 # Validate the parsed JSON structure
+                 if isinstance(json_result, list) and len(json_result) == 1:
+                     json_result = json_result[0]
+
+                 if not isinstance(json_result, dict) or "prompts" not in json_result:
+                     logger.debug(f"Parsed JSON content: {json_result}")
+                     raise ValueError(f"Parsed JSON is not a dictionary or missing 'prompts' key. - received: {json_result}")
+
+                 if not isinstance(json_result["prompts"], list):
+                     logger.debug(f"Content of 'prompts': {json_result.get('prompts')}")
+                     raise ValueError(f"'prompts' key does not contain a list. - received: {json_result.get('prompts')}")
+
+                 # Extract and log valid prompts
+                 valid_prompts = []
+                 for item in json_result["prompts"]:
+                     if (
+                         isinstance(item, dict)
+                         and "prompt" in item
+                         and isinstance(item["prompt"], list)
+                     ):
+                         prompt_text = item["prompt"]
+                         valid_prompts.append(prompt_text)
+
+                         # Log details
+                         focus = item.get("improvement_focus", "N/A")
+                         reasoning = item.get("reasoning", "N/A")
+                         logger.debug(f"Generated prompt: {prompt_text}")
+                         logger.debug(f" Improvement focus: {focus}")
+                         logger.debug(f" Reasoning: {reasoning}")
+                     else:
+                         logger.warning(
+                             f"Skipping invalid prompt item structure in JSON response: {item}"
+                         )
+
+                 if not valid_prompts:
+                     raise ValueError("No valid prompts found in the parsed JSON response after validation.")
+
+                 candidate_generation_report.set_generated_prompts(
+                     self.num_prompts_per_round
+                 )
+
+                 return valid_prompts
+                 # --- End Robust Parsing ---
+
+             except Exception as e:
+                 raise ValueError(f"Unexpected error during candidate prompt generation: {e}")
+
+     def _build_history_context(self, previous_rounds: List[OptimizationRound]) -> str:
+         """Build context from previous optimization rounds."""
+         if not previous_rounds:
+             return ""
+
+         context = "\nPrevious rounds (latest first):\n"
+         for round_data in reversed(previous_rounds[-3:]):
+             context += f"\nRound {round_data.round_number}:\n"
+             context += f"Best score this round: {round_data.best_score:.4f}\n"
+             context += "Generated prompts this round (best first):\n"
+
+             sorted_generated = sorted(
+                 round_data.generated_prompts,
+                 key=lambda p: p.get("score", -float("inf")),
+                 reverse=True,
+             )
+
+             for p in sorted_generated[:3]:
+                 prompt_text = p.get("prompt", "N/A")
+                 score = p.get("score", float("nan"))
+                 context += f"- Prompt: {prompt_text[:150]}...\n"
+                 context += f" Avg Score: {score:.4f}\n"
+         return context
+
+     def _get_evaluation_subset(
+         self, dataset: opik.Dataset, min_size: int = 20, max_size: int = 100
+     ) -> List[Dict[str, Any]]:
+         """Get a random subset of the dataset for evaluation.
+
+         Returns:
+             List[Dict[str, Any]]: A list of dataset items to evaluate against
+         """
+         try:
+             # Get all items from the dataset
+             all_items = dataset.get_items()
+             if not all_items:
+                 return all_items
+
+             # Calculate subset size
+             total_size = len(all_items)
+             subset_size = min(max(min_size, int(total_size * 0.2)), max_size)
+
+             # Get random subset of items
+             import random
+
+             return random.sample(all_items, subset_size)
+
+         except Exception as e:
+             logger.warning(f"Could not create evaluation subset: {e}")
+             return []
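
For reference, here is a minimal usage sketch of the new ChatPrompt-based API this file introduces. It is an illustration only: the import paths are inferred from the file layout listed above, and the model names, dataset, and metric are placeholders rather than anything shipped in the package.

from opik_optimizer.meta_prompt_optimizer import MetaPromptOptimizer
from opik_optimizer.optimization_config import chat_prompt

# Placeholder prompt; "{question}" is filled from each dataset item via str.format().
prompt = chat_prompt.ChatPrompt(
    messages=[
        {"role": "system", "content": "Answer the question concisely."},
        {"role": "user", "content": "{question}"},
    ]
)

optimizer = MetaPromptOptimizer(
    model="openai/gpt-4o-mini",        # evaluation model (placeholder)
    reasoning_model="openai/gpt-4o",   # model that generates candidate prompts (placeholder)
    rounds=3,
    num_prompts_per_round=4,
)

# `my_dataset` is an existing opik.Dataset and `my_metric` a metric callable
# accepted by task_evaluator; both are assumed to exist already.
result = optimizer.optimize_prompt(
    prompt=prompt,
    dataset=my_dataset,
    metric=my_metric,
    n_samples=50,
)
print(result.score, result.prompt)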