opik-optimizer 0.8.0__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between these package versions as they appear in their respective public registries.
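A diff like the one below can be reproduced locally: a wheel is a plain zip archive, so the two published wheels can be unpacked and compared with the Python standard library. A minimal sketch follows; the wheel filenames assume the standard wheel naming convention and that both files have already been downloaded into the working directory.

import difflib
import zipfile

OLD = "opik_optimizer-0.8.0-py3-none-any.whl"
NEW = "opik_optimizer-0.9.0-py3-none-any.whl"

def read_member(wheel_path: str, member: str) -> list[str]:
    # A wheel is a zip archive; return one member's text, or nothing if it is absent.
    with zipfile.ZipFile(wheel_path) as wheel:
        if member not in wheel.namelist():
            return []
        return wheel.read(member).decode("utf-8", errors="replace").splitlines(keepends=True)

def diff_member(member: str) -> str:
    # Unified diff of a single file between the two wheels.
    return "".join(
        difflib.unified_diff(
            read_member(OLD, member),
            read_member(NEW, member),
            fromfile=f"{OLD}/{member}",
            tofile=f"{NEW}/{member}",
        )
    )

if __name__ == "__main__":
    # This module exists only in the 0.8.0 wheel, so the whole file shows as removed.
    print(diff_member("opik_optimizer/meta_prompt_optimizer.py"))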
Files changed (33)
  1. opik_optimizer/__init__.py +15 -26
  2. opik_optimizer/base_optimizer.py +28 -44
  3. opik_optimizer/data/hotpot-500.json +501 -1001
  4. opik_optimizer/datasets/__init__.py +6 -7
  5. opik_optimizer/datasets/hotpot_qa.py +2 -1
  6. opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +742 -726
  7. opik_optimizer/evolutionary_optimizer/reporting.py +246 -0
  8. opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +297 -193
  9. opik_optimizer/few_shot_bayesian_optimizer/reporting.py +119 -0
  10. opik_optimizer/meta_prompt_optimizer/__init__.py +5 -0
  11. opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +816 -0
  12. opik_optimizer/meta_prompt_optimizer/reporting.py +140 -0
  13. opik_optimizer/mipro_optimizer/__init__.py +1 -1
  14. opik_optimizer/mipro_optimizer/_mipro_optimizer_v2.py +12 -20
  15. opik_optimizer/mipro_optimizer/mipro_optimizer.py +32 -52
  16. opik_optimizer/mipro_optimizer/utils.py +1 -23
  17. opik_optimizer/optimization_config/chat_prompt.py +106 -0
  18. opik_optimizer/optimization_config/configs.py +2 -21
  19. opik_optimizer/optimization_config/mappers.py +1 -1
  20. opik_optimizer/optimization_result.py +57 -85
  21. opik_optimizer/reporting_utils.py +180 -0
  22. opik_optimizer/task_evaluator.py +41 -26
  23. opik_optimizer/utils.py +187 -3
  24. {opik_optimizer-0.8.0.dist-info → opik_optimizer-0.9.0.dist-info}/METADATA +15 -31
  25. opik_optimizer-0.9.0.dist-info/RECORD +48 -0
  26. {opik_optimizer-0.8.0.dist-info → opik_optimizer-0.9.0.dist-info}/WHEEL +1 -1
  27. opik_optimizer/few_shot_bayesian_optimizer/prompt_parameter.py +0 -91
  28. opik_optimizer/few_shot_bayesian_optimizer/prompt_templates.py +0 -80
  29. opik_optimizer/integrations/__init__.py +0 -0
  30. opik_optimizer/meta_prompt_optimizer.py +0 -1151
  31. opik_optimizer-0.8.0.dist-info/RECORD +0 -45
  32. {opik_optimizer-0.8.0.dist-info → opik_optimizer-0.9.0.dist-info}/licenses/LICENSE +0 -0
  33. {opik_optimizer-0.8.0.dist-info → opik_optimizer-0.9.0.dist-info}/top_level.txt +0 -0
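Note that item 30 removes the top-level opik_optimizer/meta_prompt_optimizer.py module while item 11 adds a file of the same name inside a new opik_optimizer/meta_prompt_optimizer/ package, i.e. the optimizer was moved into a subpackage rather than dropped. A hedged sketch of what that move might mean for imports; whether 0.9.0 re-exports the class from the package root or only from the subpackage __init__ added in item 10 is an assumption:

# 0.8.0 import path (module removed in this release):
#   from opik_optimizer.meta_prompt_optimizer import MetaPromptOptimizer

# 0.9.0: the module now lives inside a package; fall back to the full path if the
# class is not re-exported higher up (re-export locations are assumptions).
try:
    from opik_optimizer import MetaPromptOptimizer
except ImportError:
    from opik_optimizer.meta_prompt_optimizer.meta_prompt_optimizer import MetaPromptOptimizer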
opik_optimizer/meta_prompt_optimizer.py
@@ -1,1151 +0,0 @@
1
- from typing import List, Dict, Any, Optional, Union
2
- import opik
3
- from opik import Dataset
4
- import litellm
5
- from litellm.caching import Cache
6
- import logging
7
- import json
8
- import os
9
- from string import Template
10
-
11
- from .optimization_config import mappers
12
- from .optimization_config.configs import MetricConfig, TaskConfig
13
- from .base_optimizer import BaseOptimizer, OptimizationRound
14
- from .optimization_result import OptimizationResult
15
- from opik_optimizer import task_evaluator
16
- from opik.api_objects import opik_client
17
- from opik.evaluation.models.litellm import opik_monitor as opik_litellm_monitor
18
- from opik.environment import get_tqdm_for_current_environment
19
- from . import _throttle
20
-
21
- tqdm = get_tqdm_for_current_environment()
22
-
23
- # Using disk cache for LLM calls
24
- disk_cache_dir = os.path.expanduser("~/.litellm_cache")
25
- litellm.cache = Cache(type="disk", disk_cache_dir=disk_cache_dir)
26
-
27
- # Set up logging
28
- logger = logging.getLogger(__name__) # Gets logger configured by setup_logging
29
-
30
- _rate_limiter = _throttle.get_rate_limiter_for_current_opik_installation()
31
-
32
-
33
- class MetaPromptOptimizer(BaseOptimizer):
34
- """Optimizer that uses meta-prompting to improve prompts based on examples and performance."""
35
-
36
- # --- Constants for Default Configuration ---
37
- DEFAULT_MAX_ROUNDS = 3
38
- DEFAULT_PROMPTS_PER_ROUND = 4
39
- DEFAULT_IMPROVEMENT_THRESHOLD = 0.05
40
- DEFAULT_INITIAL_TRIALS = 3
41
- DEFAULT_MAX_TRIALS = 6
42
- DEFAULT_ADAPTIVE_THRESHOLD = 0.8 # Set to None to disable adaptive trials
43
-
44
- # --- Reasoning System Prompt ---
45
- _REASONING_SYSTEM_PROMPT = """You are an expert prompt engineer. Your task is to improve prompts for any type of task.
46
- Focus on making the prompt more effective by:
47
- 1. Being clear and specific about what is expected
48
- 2. Providing necessary context and constraints
49
- 3. Guiding the model to produce the desired output format
50
- 4. Removing ambiguity and unnecessary elements
51
- 5. Maintaining conciseness while being complete
52
-
53
- Return a JSON array of prompts with the following structure:
54
- {
55
- "prompts": [
56
- {
57
- "prompt": "the improved prompt text",
58
- "improvement_focus": "what aspect this prompt improves",
59
- "reasoning": "why this improvement should help"
60
- }
61
- ]
62
- }"""
63
-
64
- # --- Constants for Default Configuration ---
65
- DEFAULT_MAX_ROUNDS = 3
66
- DEFAULT_PROMPTS_PER_ROUND = 4
67
- DEFAULT_IMPROVEMENT_THRESHOLD = 0.05
68
- DEFAULT_INITIAL_TRIALS = 3
69
- DEFAULT_MAX_TRIALS = 6
70
- DEFAULT_ADAPTIVE_THRESHOLD = 0.8 # Set to None to disable adaptive trials
71
-
72
- # --- Reasoning System Prompt ---
73
- _REASONING_SYSTEM_PROMPT = """You are an expert prompt engineer. Your task is to improve prompts for any type of task.
74
- Focus on making the prompt more effective by:
75
- 1. Being clear and specific about what is expected
76
- 2. Providing necessary context and constraints
77
- 3. Guiding the model to produce the desired output format
78
- 4. Removing ambiguity and unnecessary elements
79
- 5. Maintaining conciseness while being complete
80
-
81
- Return a JSON array of prompts with the following structure:
82
- {
83
- "prompts": [
84
- {
85
- "prompt": "the improved prompt text",
86
- "improvement_focus": "what aspect this prompt improves",
87
- "reasoning": "why this improvement should help"
88
- }
89
- ]
90
- }"""
91
-
92
- def __init__(
93
- self,
94
- model: str,
95
- reasoning_model: Optional[str] = None,
96
- max_rounds: int = DEFAULT_MAX_ROUNDS,
97
- num_prompts_per_round: int = DEFAULT_PROMPTS_PER_ROUND,
98
- improvement_threshold: float = DEFAULT_IMPROVEMENT_THRESHOLD,
99
- initial_trials_per_candidate: int = DEFAULT_INITIAL_TRIALS,
100
- max_trials_per_candidate: int = DEFAULT_MAX_TRIALS,
101
- adaptive_trial_threshold: Optional[float] = DEFAULT_ADAPTIVE_THRESHOLD,
102
- num_threads: int = 12,
103
- project_name: Optional[str] = None,
104
- verbose: int = 1,
105
- enable_context: bool = True,
106
- **model_kwargs,
107
- ):
108
- """
109
- Initialize the MetaPromptOptimizer.
110
-
111
- Args:
112
- model: The model to use for evaluation
113
- reasoning_model: The model to use for reasoning and prompt generation
114
- max_rounds: Maximum number of optimization rounds
115
- num_prompts_per_round: Number of prompts to generate per round
116
- improvement_threshold: Minimum improvement required to continue
117
- initial_trials_per_candidate: Number of initial evaluation trials for each candidate prompt.
118
- max_trials_per_candidate: Maximum number of evaluation trials if adaptive trials are enabled and score is promising.
119
- adaptive_trial_threshold: If not None, prompts scoring below `best_score * adaptive_trial_threshold` after initial trials won't get max trials.
120
- num_threads: Number of threads for parallel evaluation
121
- project_name: Optional project name for tracking
122
- verbose: Controls internal logging/progress bars (0=off, 1=on).
123
- enable_context: Whether to include task-specific context (metrics, examples) in the reasoning prompt.
124
- **model_kwargs: Additional model parameters
125
- """
126
- super().__init__(model=model, project_name=project_name, **model_kwargs)
127
- self.reasoning_model = reasoning_model if reasoning_model is not None else model
128
- self.max_rounds = max_rounds
129
- self.num_prompts_per_round = num_prompts_per_round
130
- self.improvement_threshold = improvement_threshold
131
- self.initial_trials = initial_trials_per_candidate
132
- self.max_trials = max_trials_per_candidate
133
- self.adaptive_threshold = adaptive_trial_threshold
134
- self.num_threads = num_threads
135
- self.verbose = verbose
136
- self.dataset = None
137
- self.task_config = None
138
- self._opik_client = opik_client.get_client_cached()
139
- self.llm_call_counter = 0
140
- self.enable_context = enable_context
141
- logger.debug(
142
- f"Initialized MetaPromptOptimizer with model={model}, reasoning_model={self.reasoning_model}"
143
- )
144
- logger.debug(
145
- f"Optimization rounds: {max_rounds}, Prompts/round: {num_prompts_per_round}"
146
- )
147
- logger.debug(
148
- f"Trials config: Initial={self.initial_trials}, Max={self.max_trials}, Adaptive Threshold={self.adaptive_threshold}"
149
- )
150
-
151
- def evaluate_prompt(
152
- self,
153
- dataset: opik.Dataset,
154
- metric_config: MetricConfig,
155
- task_config: TaskConfig,
156
- prompt: str,
157
- use_full_dataset: bool = False,
158
- experiment_config: Optional[Dict] = None,
159
- n_samples: Optional[int] = None,
160
- optimization_id: Optional[str] = None,
161
- verbose: int = 1,
162
- ) -> float:
163
- """
164
- Evaluate a prompt using the given dataset and metric configuration.
165
-
166
- Args:
167
- dataset: The dataset to evaluate against
168
- metric_config: The metric configuration to use for evaluation
169
- task_config: The task configuration containing input/output fields
170
- prompt: The prompt to evaluate
171
- use_full_dataset: Whether to use the full dataset or a subset for evaluation
172
- experiment_config: A dictionary to log with the experiments
173
- n_samples: The number of dataset items to use for evaluation
174
- optimization_id: Optional ID for tracking the optimization run
175
-
176
- Returns:
177
- float: The evaluation score
178
- """
179
- return self._evaluate_prompt(
180
- dataset=dataset,
181
- metric_config=metric_config,
182
- task_config=task_config,
183
- prompt=prompt,
184
- use_full_dataset=use_full_dataset,
185
- experiment_config=experiment_config,
186
- n_samples=n_samples,
187
- optimization_id=optimization_id,
188
- verbose=self.verbose,
189
- )
190
-
191
- @_throttle.rate_limited(_rate_limiter)
192
- def _call_model(
193
- self,
194
- prompt: str,
195
- system_prompt: Optional[str] = None,
196
- is_reasoning: bool = False,
197
- optimization_id: Optional[str] = None,
198
- ) -> str:
199
- """Call the model with the given prompt and return the response."""
200
- self.llm_call_counter += 1
201
- # Note: Basic retry logic could be added here using tenacity
202
- try:
203
- # Basic LLM parameters (e.g., temperature, max_tokens)
204
- base_temperature = getattr(self, "temperature", 0.3)
205
- base_max_tokens = getattr(self, "max_tokens", 1000)
206
-
207
- # Use potentially different settings for reasoning calls
208
- reasoning_temperature = base_temperature # Keep same temp unless specified otherwise
209
- # Increase max_tokens for reasoning to ensure JSON fits, unless already high
210
- reasoning_max_tokens = max(base_max_tokens, 3000) if is_reasoning else base_max_tokens
211
-
212
- llm_config_params = {
213
- "temperature": reasoning_temperature if is_reasoning else base_temperature,
214
- "max_tokens": reasoning_max_tokens,
215
- "top_p": getattr(self, "top_p", 1.0),
216
- "frequency_penalty": getattr(self, "frequency_penalty", 0.0),
217
- "presence_penalty": getattr(self, "presence_penalty", 0.0),
218
- }
219
-
220
- # Prepare metadata that we want to be part of the LLM call context.
221
- metadata_for_opik = {}
222
- if self.project_name:
223
- metadata_for_opik["project_name"] = (
224
- self.project_name
225
- ) # Top-level for general use
226
- metadata_for_opik["opik"] = {"project_name": self.project_name}
227
-
228
- if optimization_id:
229
- # Also add to opik-specific structure if project_name was added
230
- if "opik" in metadata_for_opik:
231
- metadata_for_opik["opik"]["optimization_id"] = optimization_id
232
-
233
- metadata_for_opik["optimizer_name"] = self.__class__.__name__
234
- metadata_for_opik["opik_call_type"] = (
235
- "reasoning" if is_reasoning else "evaluation_llm_task_direct"
236
- )
237
-
238
- if metadata_for_opik:
239
- llm_config_params["metadata"] = metadata_for_opik
240
-
241
- messages = []
242
- if system_prompt and (
243
- is_reasoning or getattr(self.task_config, "use_chat_prompt", False)
244
- ):
245
- messages.append({"role": "system", "content": system_prompt})
246
- messages.append({"role": "user", "content": prompt})
247
-
248
- model_to_use = self.reasoning_model if is_reasoning else self.model
249
-
250
- # Pass llm_config_params (which now includes our metadata) to the Opik monitor.
251
- # The monitor is expected to return a dictionary suitable for spreading into litellm.completion,
252
- # having handled our metadata and added any Opik-specific configurations.
253
- final_call_params = opik_litellm_monitor.try_add_opik_monitoring_to_params(
254
- llm_config_params.copy()
255
- )
256
-
257
- logger.debug(
258
- f"Calling model '{model_to_use}' with messages: {messages}, "
259
- f"final params for litellm (from monitor): {final_call_params}"
260
- )
261
-
262
- response = litellm.completion(
263
- model=model_to_use,
264
- messages=messages,
265
- num_retries=6,
266
- **final_call_params
267
- )
268
- return response.choices[0].message.content
269
- except litellm.exceptions.RateLimitError as e:
270
- logger.error(f"LiteLLM Rate Limit Error: {e}")
271
- raise
272
- except litellm.exceptions.APIConnectionError as e:
273
- logger.error(f"LiteLLM API Connection Error: {e}")
274
- raise
275
- except litellm.exceptions.ContextWindowExceededError as e:
276
- logger.error(f"LiteLLM Context Window Exceeded Error: {e}")
277
- # Log prompt length if possible? Needs access to prompt_for_llm here.
278
- raise
279
- except Exception as e:
280
- logger.error(
281
- f"Error calling model '{model_to_use}': {type(e).__name__} - {e}"
282
- )
283
- raise
284
-
285
- def _evaluate_prompt(
286
- self,
287
- dataset: opik.Dataset,
288
- metric_config: MetricConfig,
289
- task_config: TaskConfig,
290
- prompt: str,
291
- use_full_dataset: bool,
292
- experiment_config: Optional[Dict],
293
- n_samples: Optional[int],
294
- optimization_id: Optional[str] = None,
295
- verbose: int = 1,
296
- ) -> float:
297
- # Calculate subset size for trials
298
- if not use_full_dataset:
299
- total_items = len(dataset.get_items())
300
- if n_samples is not None:
301
- if n_samples > total_items:
302
- logger.warning(
303
- f"Requested n_samples ({n_samples}) is larger than dataset size ({total_items}). Using full dataset."
304
- )
305
- subset_size = None
306
- else:
307
- subset_size = n_samples
308
- logger.debug(f"Using specified n_samples: {subset_size} items")
309
- else:
310
- # Calculate 20% of total, but no more than 20 items and no more than total items
311
- subset_size = min(total_items, min(20, max(10, int(total_items * 0.2))))
312
- logger.debug(
313
- f"Using automatic subset size calculation: {subset_size} items (20% of {total_items} total items)"
314
- )
315
- else:
316
- subset_size = None # Use all items for final checks
317
- logger.debug("Using full dataset for evaluation")
318
- experiment_config = experiment_config or {}
319
- experiment_config = {
320
- **experiment_config,
321
- **{
322
- "optimizer": self.__class__.__name__,
323
- "metric": metric_config.metric.name,
324
- "dataset": dataset.name,
325
- "configuration": {
326
- "prompt": prompt,
327
- "n_samples": subset_size,
328
- "use_full_dataset": use_full_dataset,
329
- },
330
- },
331
- }
332
-
333
- def llm_task(dataset_item: Dict[str, Any]) -> Dict[str, str]:
334
- # Convert DatasetItem to dict if needed
335
- if hasattr(dataset_item, "to_dict"):
336
- dataset_item = dataset_item.to_dict()
337
-
338
- # Validate that input and output fields are in the dataset_item
339
- for input_key in task_config.input_dataset_fields:
340
- if input_key not in dataset_item:
341
- logger.error(
342
- f"Input field '{input_key}' not found in dataset sample: {dataset_item}"
343
- )
344
- raise ValueError(
345
- f"Input field '{input_key}' not found in dataset sample"
346
- )
347
- if task_config.output_dataset_field not in dataset_item:
348
- logger.error(
349
- f"Output field '{task_config.output_dataset_field}' not found in dataset sample: {dataset_item}"
350
- )
351
- raise ValueError(
352
- f"Output field '{task_config.output_dataset_field}' not found in dataset sample"
353
- )
354
-
355
- # --- Step 1: Prepare the prompt for the LLM ---
356
- prompt_for_llm: str
357
- field_mapping = {
358
- field: dataset_item[field]
359
- for field in task_config.input_dataset_fields
360
- if field in dataset_item
361
- }
362
-
363
- if getattr(task_config, "use_chat_prompt", False):
364
- # For chat prompts, the candidate prompt `prompt` is expected to be a template for the user message.
365
- # We assume it contains placeholders like {question} or {text}.
366
- candidate_template = Template(prompt)
367
- prompt_for_llm = candidate_template.safe_substitute(field_mapping)
368
- else:
369
- # For non-chat prompts, `prompt` (the candidate/initial prompt) is the base instruction.
370
- # Append the actual data fields to it.
371
- input_clauses = []
372
- for field_name in task_config.input_dataset_fields:
373
- if field_name in dataset_item:
374
- input_clauses.append(
375
- f"{field_name.capitalize()}: {dataset_item[field_name]}"
376
- )
377
- item_specific_inputs_str = "\n".join(input_clauses)
378
- prompt_for_llm = f"{prompt}\n\n{item_specific_inputs_str}"
379
-
380
- logger.debug(f"Evaluating with inputs: {field_mapping}")
381
- logger.debug(f"Prompt for LLM: {prompt_for_llm}")
382
-
383
- # --- Step 2: Call the model ---
384
- try:
385
- logger.debug(f"Calling LLM with prompt length: {len(prompt_for_llm)}")
386
- raw_model_output = self._call_model(
387
- prompt=prompt_for_llm,
388
- system_prompt=None,
389
- is_reasoning=False,
390
- optimization_id=optimization_id,
391
- )
392
- logger.debug(f"LLM raw response length: {len(raw_model_output)}")
393
- logger.debug(f"LLM raw output: {raw_model_output}")
394
- except Exception as e:
395
- logger.error(f"Error calling model with prompt: {e}")
396
- logger.error(f"Failed prompt: {prompt_for_llm}")
397
- logger.error(f"Prompt length: {len(prompt_for_llm)}")
398
- raise
399
-
400
- # --- Step 3: Clean the model's output before metric evaluation ---
401
- cleaned_model_output = raw_model_output.strip()
402
- original_cleaned_output = cleaned_model_output # For logging if changed
403
-
404
- # Dynamically generate prefixes based on the output field name
405
- output_field = task_config.output_dataset_field # e.g., "answer" or "label"
406
- dynamic_prefixes = [
407
- f"{output_field.capitalize()}:",
408
- f"{output_field.capitalize()} :",
409
- f"{output_field}:", # Also check lowercase field name
410
- f"{output_field} :",
411
- ]
412
-
413
- # Add common generic prefixes
414
- generic_prefixes = ["Answer:", "Answer :", "A:"]
415
-
416
- # Combine and remove duplicates (if any)
417
- prefixes_to_strip = list(set(dynamic_prefixes + generic_prefixes))
418
- logger.debug(f"Prefixes to strip: {prefixes_to_strip}")
419
-
420
- for prefix_to_check in prefixes_to_strip:
421
- # Perform case-insensitive check for robustness
422
- if cleaned_model_output.lower().startswith(prefix_to_check.lower()):
423
- # Strip based on the actual length of the found prefix
424
- cleaned_model_output = cleaned_model_output[
425
- len(prefix_to_check) :
426
- ].strip()
427
- logger.debug(
428
- f"Stripped prefix '{prefix_to_check}', new output for metric: {cleaned_model_output}"
429
- )
430
- break # Stop after stripping the first found prefix
431
-
432
- if original_cleaned_output != cleaned_model_output:
433
- logger.debug(
434
- f"Raw model output: '{original_cleaned_output}' -> Cleaned for metric: '{cleaned_model_output}'"
435
- )
436
- result = {
437
- mappers.EVALUATED_LLM_TASK_OUTPUT: cleaned_model_output,
438
- }
439
- return result
440
-
441
- # Use dataset's get_items with limit for sampling
442
- logger.info(
443
- f"Starting evaluation with {subset_size if subset_size else 'all'} samples for metric: {metric_config.metric.name}"
444
- )
445
- score = task_evaluator.evaluate(
446
- dataset=dataset,
447
- metric_config=metric_config,
448
- evaluated_task=llm_task,
449
- num_threads=self.num_threads,
450
- project_name=self.project_name,
451
- n_samples=subset_size, # Use subset_size for trials, None for full dataset
452
- experiment_config=experiment_config,
453
- optimization_id=optimization_id,
454
- verbose=self.verbose,
455
- )
456
- logger.debug(f"Evaluation score: {score:.4f}")
457
- return score
458
-
459
- def optimize_prompt(
460
- self,
461
- dataset: Union[str, Dataset],
462
- metric_config: MetricConfig,
463
- task_config: TaskConfig,
464
- experiment_config: Optional[Dict] = None,
465
- n_samples: Optional[int] = None,
466
- auto_continue: bool = False,
467
- **kwargs,
468
- ) -> OptimizationResult:
469
- """
470
- Optimize a prompt using meta-reasoning.
471
-
472
- Args:
473
- dataset: The dataset to evaluate against
474
- metric_config: The metric configuration to use for evaluation
475
- task_config: The task configuration containing input/output fields
476
- experiment_config: A dictionary to log with the experiments
477
- n_samples: The number of dataset items to use for evaluation
478
- auto_continue: If True, the algorithm may continue if goal not met
479
- **kwargs: Additional arguments for evaluation
480
-
481
- Returns:
482
- OptimizationResult: Structured result containing optimization details
483
- """
484
- total_items = len(dataset.get_items())
485
- if n_samples is not None and n_samples > total_items:
486
- logger.warning(
487
- f"Requested n_samples ({n_samples}) is larger than dataset size ({total_items}). Using full dataset."
488
- )
489
- n_samples = None
490
-
491
- logger.info(
492
- f"Starting optimization with n_samples={n_samples}, auto_continue={auto_continue}"
493
- )
494
- logger.info(f"Dataset size: {total_items} items")
495
- logger.info(f"Initial prompt: {task_config.instruction_prompt}")
496
-
497
- optimization = None
498
- try:
499
- optimization = self._opik_client.create_optimization(
500
- dataset_name=dataset.name,
501
- objective_name=metric_config.metric.name,
502
- metadata={"optimizer": self.__class__.__name__},
503
- )
504
- logger.info(f"Created optimization with ID: {optimization.id}")
505
- except Exception as e:
506
- logger.warning(
507
- f"Opik server does not support optimizations: {e}. Please upgrade opik."
508
- )
509
- optimization = None
510
-
511
- try:
512
- result = self._optimize_prompt(
513
- optimization_id=optimization.id if optimization is not None else None,
514
- dataset=dataset,
515
- metric_config=metric_config,
516
- task_config=task_config,
517
- experiment_config=experiment_config,
518
- n_samples=n_samples,
519
- auto_continue=auto_continue,
520
- **kwargs,
521
- )
522
- if optimization:
523
- self.update_optimization(optimization, status="completed")
524
- logger.info("Optimization completed successfully")
525
- return result
526
- except Exception as e:
527
- logger.error(f"Optimization failed: {e}")
528
- if optimization:
529
- self.update_optimization(optimization, status="cancelled")
530
- logger.info("Optimization marked as cancelled")
531
- raise e
532
-
533
- def _optimize_prompt(
534
- self,
535
- optimization_id: Optional[str],
536
- dataset: Union[str, Dataset],
537
- metric_config: MetricConfig,
538
- task_config: TaskConfig,
539
- experiment_config: Optional[Dict],
540
- n_samples: Optional[int],
541
- auto_continue: bool,
542
- **kwargs,
543
- ) -> OptimizationResult:
544
- self.auto_continue = auto_continue
545
- self.dataset = dataset
546
- self.task_config = task_config
547
- self.llm_call_counter = 0 # Reset counter for run
548
-
549
- current_prompt = task_config.instruction_prompt
550
- experiment_config = experiment_config or {}
551
- experiment_config = {
552
- **experiment_config,
553
- **{
554
- "optimizer": self.__class__.__name__,
555
- "metric": metric_config.metric.name,
556
- "dataset": self.dataset.name,
557
- "configuration": {
558
- "prompt": current_prompt,
559
- "max_rounds": self.max_rounds,
560
- "num_prompts_per_round": self.num_prompts_per_round,
561
- "improvement_threshold": self.improvement_threshold,
562
- "initial_trials": self.initial_trials,
563
- "max_trials": self.max_trials,
564
- "adaptive_threshold": self.adaptive_threshold,
565
- },
566
- },
567
- }
568
-
569
- logger.info("Evaluating initial prompt")
570
- initial_score = self.evaluate_prompt(
571
- optimization_id=optimization_id,
572
- dataset=dataset,
573
- metric_config=metric_config,
574
- task_config=task_config,
575
- prompt=current_prompt,
576
- n_samples=n_samples,
577
- experiment_config=experiment_config,
578
- use_full_dataset=n_samples is None,
579
- verbose=self.verbose,
580
- )
581
- best_score = initial_score
582
- best_prompt = current_prompt
583
- rounds = []
584
- stopped_early = False
585
-
586
- logger.info(f"Initial score: {initial_score:.4f}")
587
-
588
- # Initialize TQDM with postfix placeholder
589
- pbar = tqdm(
590
- total=self.max_rounds,
591
- desc="Optimizing Prompt",
592
- unit="round",
593
- bar_format="{l_bar}{bar:20}{r_bar} | {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}{postfix}]",
594
- position=0,
595
- leave=True,
596
- postfix={
597
- "best_score": f"{initial_score:.4f}",
598
- "llm_calls": self.llm_call_counter,
599
- },
600
- )
601
-
602
- for round_num in range(self.max_rounds):
603
- logger.info(f"\n{'='*50}")
604
- logger.info(f"Starting Round {round_num + 1}/{self.max_rounds}")
605
- logger.info(f"Current best score: {best_score:.4f}")
606
- logger.info(f"Current best prompt: {best_prompt}")
607
-
608
- previous_best_score = best_score
609
- try:
610
- logger.info("Generating candidate prompts")
611
- candidate_prompts = self._generate_candidate_prompts(
612
- current_prompt=best_prompt,
613
- best_score=best_score,
614
- round_num=round_num,
615
- previous_rounds=rounds,
616
- metric_config=metric_config,
617
- optimization_id=optimization_id,
618
- )
619
- logger.info(f"Generated {len(candidate_prompts)} candidate prompts")
620
- except Exception as e:
621
- logger.error(f"Error generating candidate prompts: {e}")
622
- break
623
-
624
- prompt_scores = []
625
- for candidate_count, prompt in enumerate(candidate_prompts):
626
- logger.info(
627
- f"\nEvaluating candidate {candidate_count + 1}/{len(candidate_prompts)}"
628
- )
629
- logger.info(f"Prompt: {prompt}")
630
-
631
- scores = []
632
- should_run_max_trials = True
633
-
634
- # Initial trials
635
- logger.debug(f"Running initial {self.initial_trials} trials...")
636
- for trial in range(self.initial_trials):
637
- try:
638
- logger.debug(f"Trial {trial + 1}/{self.initial_trials}")
639
- score = self.evaluate_prompt(
640
- dataset=dataset,
641
- metric_config=metric_config,
642
- task_config=task_config,
643
- prompt=prompt,
644
- n_samples=n_samples,
645
- use_full_dataset=False,
646
- experiment_config=experiment_config,
647
- verbose=self.verbose,
648
- )
649
- scores.append(score)
650
- logger.debug(f"Trial {trial+1} score: {score:.4f}")
651
- except Exception as e:
652
- logger.error(f"Error in trial {trial + 1}: {e}")
653
- continue
654
-
655
- if not scores:
656
- logger.warning(
657
- "All initial trials failed for this prompt, skipping"
658
- )
659
- continue
660
-
661
- # Adaptive trials logic
662
- avg_score_initial = sum(scores) / len(scores)
663
- if (
664
- self.adaptive_threshold is not None
665
- and self.max_trials > self.initial_trials
666
- and avg_score_initial < best_score * self.adaptive_threshold
667
- ):
668
- should_run_max_trials = False
669
- logger.debug("Skipping additional trials...")
670
-
671
- # Run additional trials
672
- if should_run_max_trials and self.max_trials > self.initial_trials:
673
- num_additional_trials = self.max_trials - self.initial_trials
674
- logger.debug(
675
- f"Running {num_additional_trials} additional trials..."
676
- )
677
- for trial in range(self.initial_trials, self.max_trials):
678
- try:
679
- logger.debug(
680
- f"Additional trial {trial + 1}/{self.max_trials}"
681
- )
682
- score = self.evaluate_prompt(
683
- dataset=dataset,
684
- metric_config=metric_config,
685
- task_config=task_config,
686
- prompt=prompt,
687
- n_samples=n_samples,
688
- use_full_dataset=False,
689
- experiment_config=experiment_config,
690
- verbose=self.verbose,
691
- )
692
- scores.append(score)
693
- logger.debug(
694
- f"Additional trial {trial+1} score: {score:.4f}"
695
- )
696
- except Exception as e:
697
- logger.error(f"Error in additional trial {trial + 1}: {e}")
698
- continue
699
-
700
- # Calculate final average score
701
- if scores:
702
- final_avg_score = sum(scores) / len(scores)
703
- prompt_scores.append((prompt, final_avg_score, scores))
704
- logger.info(f"Completed {len(scores)} trials for prompt.")
705
- logger.info(f"Final average score: {final_avg_score:.4f}")
706
- logger.debug(
707
- f"Individual trial scores: {[f'{s:.4f}' for s in scores]}"
708
- )
709
- else:
710
- # This case should be rare now due to the initial check, but good practice
711
- logger.warning("No successful trials completed for this prompt.")
712
-
713
- if not prompt_scores:
714
- logger.warning("No prompts were successfully evaluated in this round")
715
- break
716
-
717
- # Sort by float score
718
- prompt_scores.sort(key=lambda x: x[1], reverse=True)
719
- best_candidate_this_round, best_cand_score_avg, best_cand_trials = (
720
- prompt_scores[0]
721
- )
722
-
723
- logger.info(
724
- f"\nBest candidate from this round (avg score {metric_config.metric.name}): {best_cand_score_avg:.4f}"
725
- )
726
- logger.info(f"Prompt: {best_candidate_this_round}")
727
-
728
- # Re-evaluate the best candidate from the round using the full dataset (if n_samples is None)
729
- # or the specified n_samples subset for a more stable score comparison.
730
- # This uses use_full_dataset flag appropriately.
731
- if best_cand_score_avg > best_score:
732
- logger.info("Running final evaluation on best candidate...")
733
- final_score_best_cand = self.evaluate_prompt(
734
- optimization_id=optimization_id,
735
- dataset=dataset,
736
- metric_config=metric_config,
737
- task_config=task_config,
738
- prompt=best_candidate_this_round,
739
- experiment_config=experiment_config,
740
- n_samples=n_samples,
741
- use_full_dataset=n_samples is None,
742
- verbose=self.verbose,
743
- )
744
- logger.info(
745
- f"Final evaluation score for best candidate: {final_score_best_cand:.4f}"
746
- )
747
-
748
- if final_score_best_cand > best_score:
749
- logger.info("New best prompt found!")
750
- best_score = final_score_best_cand
751
- best_prompt = best_candidate_this_round
752
- logger.info(f"New Best Prompt: {best_prompt}")
753
- logger.info(
754
- f"New Best Score ({metric_config.metric.name}): {best_score:.4f}"
755
- )
756
- else:
757
- logger.info(
758
- "Best candidate score did not improve upon final evaluation."
759
- )
760
- # Decide what prompt to carry to the next round's generation step.
761
- # Option 1: Carry the best scoring prompt overall (best_prompt)
762
- # Option 2: Carry the best candidate from this round (best_candidate_this_round) even if it didn't beat the overall best after final eval.
763
- # Let's stick with Option 1 for now - always generate from the overall best.
764
- # current_prompt = best_prompt # Implicitly done as best_prompt is updated
765
-
766
- improvement = self._calculate_improvement(best_score, previous_best_score)
767
- logger.info(
768
- f"Improvement in score ({metric_config.metric.name}) this round: {improvement:.2%}"
769
- )
770
-
771
- # Create round data
772
- round_data = self._create_round_data(
773
- round_num,
774
- best_prompt,
775
- best_score,
776
- best_prompt,
777
- prompt_scores,
778
- previous_best_score,
779
- improvement,
780
- )
781
- rounds.append(round_data)
782
- self._add_to_history(round_data.dict())
783
-
784
- if (
785
- improvement < self.improvement_threshold and round_num > 0
786
- ): # Avoid stopping after first round if threshold is low
787
- logger.info(
788
- f"Improvement below threshold ({improvement:.2%} < {self.improvement_threshold:.2%}), stopping early"
789
- )
790
- stopped_early = True
791
- break
792
-
793
- # Update TQDM postfix
794
- pbar.set_postfix(
795
- {
796
- "best_score": f"{best_score:.4f}",
797
- "improvement": f"{improvement:.2%}",
798
- "llm_calls": self.llm_call_counter,
799
- }
800
- )
801
- pbar.update(1)
802
-
803
- pbar.close()
804
-
805
- logger.info("\n" + "=" * 80)
806
- logger.info("OPTIMIZATION COMPLETE")
807
- logger.info("=" * 80)
808
- logger.info(f"Initial score: {initial_score:.4f}")
809
- logger.info(f"Final best score: {best_score:.4f}")
810
- if initial_score != 0: # Avoid division by zero if initial score was 0
811
- total_improvement_pct = (best_score - initial_score) / abs(
812
- initial_score
813
- ) # Use abs for safety
814
- logger.info(f"Total improvement: {total_improvement_pct:.2%}")
815
- elif best_score > 0:
816
- logger.info("Total improvement: infinite (initial score was 0)")
817
- else:
818
- logger.info("Total improvement: 0.00% (scores did not improve from 0)")
819
- logger.info("\nFINAL OPTIMIZED PROMPT:")
820
- logger.info("-" * 80)
821
- logger.info(best_prompt)
822
- logger.info("-" * 80)
823
- logger.info("=" * 80)
824
-
825
- return self._create_result(
826
- metric_config,
827
- task_config,
828
- best_prompt,
829
- best_score,
830
- initial_score,
831
- rounds,
832
- stopped_early,
833
- )
834
-
835
- def _calculate_improvement(
836
- self, current_score: float, previous_score: float
837
- ) -> float:
838
- """Calculate the improvement percentage between scores."""
839
- return (
840
- (current_score - previous_score) / previous_score
841
- if previous_score > 0
842
- else 0
843
- )
844
-
845
- def _create_round_data(
846
- self,
847
- round_num: int,
848
- current_best_prompt: str,
849
- current_best_score: float,
850
- best_prompt_overall: str,
851
- evaluated_candidates: List[tuple[str, float, List[float]]],
852
- previous_best_score: float,
853
- improvement_this_round: float,
854
- ) -> OptimizationRound:
855
- """Create an OptimizationRound object with the current round's data."""
856
- generated_prompts_log = []
857
- for prompt, avg_score, trial_scores in evaluated_candidates:
858
- improvement_vs_prev = self._calculate_improvement(
859
- avg_score, previous_best_score
860
- )
861
- generated_prompts_log.append(
862
- {
863
- "prompt": prompt,
864
- "score": avg_score,
865
- "trial_scores": trial_scores,
866
- "improvement": improvement_vs_prev,
867
- }
868
- )
869
-
870
- return OptimizationRound(
871
- round_number=round_num + 1,
872
- current_prompt=current_best_prompt,
873
- current_score=current_best_score,
874
- generated_prompts=generated_prompts_log,
875
- best_prompt=best_prompt_overall,
876
- best_score=current_best_score,
877
- improvement=improvement_this_round,
878
- )
879
-
880
- def _create_result(
881
- self,
882
- metric_config: MetricConfig,
883
- task_config: TaskConfig,
884
- best_prompt: str,
885
- best_score: float,
886
- initial_score: float,
887
- rounds: List[OptimizationRound],
888
- stopped_early: bool,
889
- ) -> OptimizationResult:
890
- """Create the final OptimizationResult object."""
891
- details = {
892
- "prompt_type": "chat" if task_config.use_chat_prompt else "non-chat",
893
- "initial_prompt": task_config.instruction_prompt,
894
- "initial_score": initial_score,
895
- "final_prompt": best_prompt,
896
- "final_score": best_score,
897
- "rounds": rounds,
898
- "total_rounds": len(rounds),
899
- "stopped_early": stopped_early,
900
- "metric_config": metric_config.dict(),
901
- "task_config": task_config.dict(),
902
- "model": self.model,
903
- "temperature": self.model_kwargs.get("temperature"),
904
- }
905
-
906
- return OptimizationResult(
907
- optimizer=self.__class__.__name__,
908
- prompt=best_prompt,
909
- score=best_score,
910
- metric_name=metric_config.metric.name,
911
- details=details,
912
- llm_calls=self.llm_call_counter
913
- )
914
-
915
- def _get_task_context(self, metric_config: MetricConfig) -> str:
916
- """Get task-specific context from the dataset and metric configuration."""
917
- if self.dataset is None or self.task_config is None:
918
- return ""
919
-
920
- input_fields = self.task_config.input_dataset_fields
921
- output_field = self.task_config.output_dataset_field
922
-
923
- # Describe Single Metric
924
- metric_name = metric_config.metric.name
925
- description = getattr(
926
- metric_config.metric, "description", "No description available."
927
- )
928
- goal = (
929
- "higher is better"
930
- if getattr(metric_config.metric, "higher_is_better", True)
931
- else "lower is better"
932
- )
933
- metrics_str = f"- {metric_name}: {description} ({goal})"
934
-
935
- context = "\nTask Context:\n"
936
- context += f"Input fields: {', '.join(input_fields)}\n"
937
- context += f"Output field: {output_field}\n"
938
- context += f"Evaluation Metric:\n{metrics_str}\n"
939
-
940
- try:
941
- # Try get_items() first as it's the preferred method
942
- items = self.dataset.get_items()
943
- if items:
944
- sample = items[0] # Get first sample
945
- else:
946
- # Fallback to other methods if get_items() fails or returns empty
947
- if hasattr(self.dataset, "samples") and self.dataset.samples:
948
- sample = self.dataset.samples[0] # Get first sample
949
- elif hasattr(self.dataset, "__iter__"):
950
- sample = next(iter(self.dataset))
951
- else:
952
- logger.warning(
953
- "Dataset does not have a samples attribute or is not iterable"
954
- )
955
- return context
956
-
957
- if sample is not None:
958
- context += "\nExample:\n"
959
- for field in input_fields:
960
- if field in sample:
961
- context += f"Input '{field}': {sample[field]}\n"
962
- if output_field in sample:
963
- context += f"Output '{output_field}': {sample[output_field]}\n"
964
- except Exception as e:
965
- logger.warning(f"Could not get sample from dataset: {e}")
966
-
967
- return context
968
-
969
- def _generate_candidate_prompts(
970
- self,
971
- current_prompt: str,
972
- best_score: float,
973
- round_num: int,
974
- previous_rounds: List[OptimizationRound],
975
- metric_config: MetricConfig,
976
- optimization_id: Optional[str] = None,
977
- ) -> List[str]:
978
- """Generate candidate prompts using meta-prompting."""
979
-
980
- logger.debug(f"\nGenerating candidate prompts for round {round_num + 1}")
981
- logger.debug(f"Generating from prompt: {current_prompt}")
982
- logger.debug(f"Current best score: {best_score:.4f}")
983
-
984
- # Pass single metric_config
985
- history_context = self._build_history_context(previous_rounds)
986
- task_context_str = ""
987
- analysis_instruction = ""
988
- metric_focus_instruction = ""
989
- improvement_point_1 = ""
990
-
991
- if self.enable_context:
992
- task_context_str = self._get_task_context(metric_config=metric_config)
993
- analysis_instruction = "Analyze the example provided (if any), the metric description (if any), and the history of scores."
994
- metric_focus_instruction = f"Focus on improving the score for the metric: {metric_config.metric.name}."
995
- improvement_point_1 = "1. Be more specific and clear about expectations based on the metric and task."
996
- logger.debug("Task context and metric-specific instructions enabled for reasoning prompt.")
997
- else:
998
- analysis_instruction = "Analyze the history of scores and the current prompt's performance."
999
- metric_focus_instruction = "Focus on generating diverse and effective prompt variations based on the history."
1000
- improvement_point_1 = "1. Be more specific and clear about expectations based on the task."
1001
- logger.debug("Task context and metric-specific instructions disabled for reasoning prompt.")
1002
-
1003
- user_prompt = f"""Current prompt: {current_prompt}
1004
- Current score: {best_score}
1005
- {history_context}
1006
- {task_context_str}
1007
-
1008
- {analysis_instruction}
1009
- Generate {self.num_prompts_per_round} improved versions of this prompt.
1010
- {metric_focus_instruction}
1011
- Each version should aim to:
1012
- {improvement_point_1}
1013
- 2. Provide necessary context and constraints (if applicable, without relying on disabled external context).
1014
- 3. Guide the model to produce the desired output format suitable for the task.
1015
- 4. Remove ambiguity and unnecessary elements.
1016
- 5. Maintain conciseness while being complete.
1017
-
1018
- Return a valid JSON array as specified."""
1019
-
1020
- try:
1021
- # Use _call_model which handles selecting reasoning_model
1022
- content = self._call_model(
1023
- prompt=user_prompt,
1024
- system_prompt=self._REASONING_SYSTEM_PROMPT,
1025
- is_reasoning=True,
1026
- optimization_id=optimization_id,
1027
- )
1028
- logger.debug(f"Raw response from reasoning model: {content}")
1029
-
1030
- # --- Robust JSON Parsing and Validation ---
1031
- json_result = None
1032
- try:
1033
- # Try direct JSON parsing
1034
- json_result = json.loads(content)
1035
- except json.JSONDecodeError:
1036
- # If direct fails, try regex extraction
1037
- logger.warning(
1038
- "Direct JSON parsing failed, attempting regex extraction."
1039
- )
1040
- import re
1041
-
1042
- json_match = re.search(r"\{.*\}", content, re.DOTALL)
1043
- if json_match:
1044
- try:
1045
- json_result = json.loads(json_match.group())
1046
- except json.JSONDecodeError as e:
1047
- logger.error(f"Could not parse JSON extracted via regex: {e}")
1048
- return [current_prompt] # Fallback
1049
- else:
1050
- logger.error("No JSON object found in response via regex.")
1051
- return [current_prompt] # Fallback
1052
-
1053
- # Validate the parsed JSON structure
1054
- if not isinstance(json_result, dict) or "prompts" not in json_result:
1055
- logger.error(
1056
- "Parsed JSON is not a dictionary or missing 'prompts' key."
1057
- )
1058
- logger.debug(f"Parsed JSON content: {json_result}")
1059
- return [current_prompt] # Fallback
1060
-
1061
- if not isinstance(json_result["prompts"], list):
1062
- logger.error("'prompts' key does not contain a list.")
1063
- logger.debug(f"Content of 'prompts': {json_result.get('prompts')}")
1064
- return [current_prompt] # Fallback
1065
-
1066
- # Extract and log valid prompts
1067
- valid_prompts = []
1068
- for item in json_result["prompts"]:
1069
- if (
1070
- isinstance(item, dict)
1071
- and "prompt" in item
1072
- and isinstance(item["prompt"], str)
1073
- ):
1074
- prompt_text = item["prompt"]
1075
- valid_prompts.append(prompt_text)
1076
- # Log details
1077
- focus = item.get("improvement_focus", "N/A")
1078
- reasoning = item.get("reasoning", "N/A")
1079
- logger.info(f"Generated prompt: {prompt_text}")
1080
- logger.info(f" Improvement focus: {focus}")
1081
- logger.info(f" Reasoning: {reasoning}")
1082
- else:
1083
- logger.warning(
1084
- f"Skipping invalid prompt item structure in JSON response: {item}"
1085
- )
1086
-
1087
- if not valid_prompts:
1088
- logger.warning(
1089
- "No valid prompts found in the parsed JSON response after validation."
1090
- )
1091
- return [current_prompt] # Fallback
1092
-
1093
- return valid_prompts
1094
- # --- End Robust Parsing ---
1095
-
1096
- except Exception as e:
1097
- # Catch other errors during model call or processing
1098
- logger.error(f"Unexpected error during candidate prompt generation: {e}")
1099
- logger.error("Falling back to current prompt.")
1100
- return [current_prompt]
1101
-
1102
- def _build_history_context(self, previous_rounds: List[OptimizationRound]) -> str:
1103
- """Build context from previous optimization rounds."""
1104
- if not previous_rounds:
1105
- return ""
1106
-
1107
- context = "\nPrevious rounds (latest first):\n"
1108
- for round_data in reversed(previous_rounds[-3:]):
1109
- context += f"\nRound {round_data.round_number}:\n"
1110
- context += f"Best score this round: {round_data.best_score:.4f}\n"
1111
- context += "Generated prompts this round (best first):\n"
1112
-
1113
- sorted_generated = sorted(
1114
- round_data.generated_prompts,
1115
- key=lambda p: p.get("score", -float("inf")),
1116
- reverse=True,
1117
- )
1118
-
1119
- for p in sorted_generated[:3]:
1120
- prompt_text = p.get("prompt", "N/A")
1121
- score = p.get("score", float("nan"))
1122
- context += f"- Prompt: {prompt_text[:150]}...\n"
1123
- context += f" Avg Score: {score:.4f}\n"
1124
- return context
1125
-
1126
- def _get_evaluation_subset(
1127
- self, dataset: opik.Dataset, min_size: int = 20, max_size: int = 100
1128
- ) -> List[Dict[str, Any]]:
1129
- """Get a random subset of the dataset for evaluation.
1130
-
1131
- Returns:
1132
- List[Dict[str, Any]]: A list of dataset items to evaluate against
1133
- """
1134
- try:
1135
- # Get all items from the dataset
1136
- all_items = dataset.get_items()
1137
- if not all_items:
1138
- return all_items
1139
-
1140
- # Calculate subset size
1141
- total_size = len(all_items)
1142
- subset_size = min(max(min_size, int(total_size * 0.2)), max_size)
1143
-
1144
- # Get random subset of items
1145
- import random
1146
-
1147
- return random.sample(all_items, subset_size)
1148
-
1149
- except Exception as e:
1150
- logger.warning(f"Could not create evaluation subset: {e}")
1151
- return []