opik-optimizer 0.8.0__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opik_optimizer/__init__.py +15 -26
- opik_optimizer/base_optimizer.py +28 -44
- opik_optimizer/data/hotpot-500.json +501 -1001
- opik_optimizer/datasets/__init__.py +6 -7
- opik_optimizer/datasets/hotpot_qa.py +2 -1
- opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +742 -726
- opik_optimizer/evolutionary_optimizer/reporting.py +246 -0
- opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +297 -193
- opik_optimizer/few_shot_bayesian_optimizer/reporting.py +119 -0
- opik_optimizer/meta_prompt_optimizer/__init__.py +5 -0
- opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +816 -0
- opik_optimizer/meta_prompt_optimizer/reporting.py +140 -0
- opik_optimizer/mipro_optimizer/__init__.py +1 -1
- opik_optimizer/mipro_optimizer/_mipro_optimizer_v2.py +12 -20
- opik_optimizer/mipro_optimizer/mipro_optimizer.py +32 -52
- opik_optimizer/mipro_optimizer/utils.py +1 -23
- opik_optimizer/optimization_config/chat_prompt.py +106 -0
- opik_optimizer/optimization_config/configs.py +2 -21
- opik_optimizer/optimization_config/mappers.py +1 -1
- opik_optimizer/optimization_result.py +57 -85
- opik_optimizer/reporting_utils.py +180 -0
- opik_optimizer/task_evaluator.py +41 -26
- opik_optimizer/utils.py +187 -3
- {opik_optimizer-0.8.0.dist-info → opik_optimizer-0.9.0.dist-info}/METADATA +15 -31
- opik_optimizer-0.9.0.dist-info/RECORD +48 -0
- {opik_optimizer-0.8.0.dist-info → opik_optimizer-0.9.0.dist-info}/WHEEL +1 -1
- opik_optimizer/few_shot_bayesian_optimizer/prompt_parameter.py +0 -91
- opik_optimizer/few_shot_bayesian_optimizer/prompt_templates.py +0 -80
- opik_optimizer/integrations/__init__.py +0 -0
- opik_optimizer/meta_prompt_optimizer.py +0 -1151
- opik_optimizer-0.8.0.dist-info/RECORD +0 -45
- {opik_optimizer-0.8.0.dist-info → opik_optimizer-0.9.0.dist-info}/licenses/LICENSE +0 -0
- {opik_optimizer-0.8.0.dist-info → opik_optimizer-0.9.0.dist-info}/top_level.txt +0 -0
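
The notable structural change in this release is that the single module `opik_optimizer/meta_prompt_optimizer.py` is removed and replaced by a `meta_prompt_optimizer/` package with its own `__init__.py` and `reporting.py`. Assuming the new package `__init__.py` simply re-exports the optimizer class (a 5-line file, not verified against the wheel contents), the 0.8.0 import path would keep working:

    # Presumed to resolve in both 0.8.0 (module) and 0.9.0 (package); assumption, not verified.
    from opik_optimizer.meta_prompt_optimizer import MetaPromptOptimizer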
opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py (new file)
@@ -0,0 +1,816 @@
import json
import logging
import os
from typing import Any, Callable, Dict, List, Optional, overload

import litellm
import opik
from litellm.caching import Cache
from litellm.types.caching import LiteLLMCacheType
from opik import Dataset
from opik.api_objects import opik_client
from opik.environment import get_tqdm_for_current_environment
from opik.evaluation.models.litellm import opik_monitor as opik_litellm_monitor

from opik_optimizer import task_evaluator

from .. import _throttle
from ..base_optimizer import BaseOptimizer, OptimizationRound
from ..optimization_config import chat_prompt, mappers
from ..optimization_result import OptimizationResult
from . import reporting

tqdm = get_tqdm_for_current_environment()

# Using disk cache for LLM calls
disk_cache_dir = os.path.expanduser("~/.litellm_cache")
litellm.cache = Cache(type=LiteLLMCacheType.DISK, disk_cache_dir=disk_cache_dir)

# Set up logging
logger = logging.getLogger(__name__)  # Gets logger configured by setup_logging

_rate_limiter = _throttle.get_rate_limiter_for_current_opik_installation()


class MetaPromptOptimizer(BaseOptimizer):
    """
    The Meta-Prompt Optimizer uses meta-prompting to improve prompts based on examples and performance.

    This algorithm is best used when you have a prompt and would like to make sure it follows best
    practices.
    """
    # --- Constants for Default Configuration ---
    DEFAULT_ROUNDS = 3
    DEFAULT_PROMPTS_PER_ROUND = 4

    # --- Reasoning System Prompt ---
    _REASONING_SYSTEM_PROMPT = """You are an expert prompt engineer. Your task is to improve prompts for any type of task.

Focus on making the prompt more effective by:
1. Being clear and specific about what is expected
2. Providing necessary context and constraints
3. Guiding the model to produce the desired output format
4. Removing ambiguity and unnecessary elements
5. Maintaining conciseness while being complete

Instructions:
1. If there is a system prompt, prioritize adding instructions there if and only if it makes sense.
2. DO NOT add any variables or parameters to the prompt you are editing.
3. You can reuse variables that already exist in the prompt.

Return a JSON array of prompts with the following structure. Make sure to return a valid
JSON object with correct use of double quotes and single quotes. JSON keys should be
double-quoted:
{
    "prompts": [
        {
            "prompt": [{"role": "<role>", "content": "<content>"}],
            "improvement_focus": "what aspect this prompt improves",
            "reasoning": "why this improvement should help"
        },
        {
            "prompt": [{"role": "<role>", "content": "<content>"}],
            "improvement_focus": "what aspect this prompt improves",
            "reasoning": "why this improvement should help"
        }
    ]
}"""

    def __init__(
        self,
        model: str,
        reasoning_model: Optional[str] = None,
        rounds: int = DEFAULT_ROUNDS,
        num_prompts_per_round: int = DEFAULT_PROMPTS_PER_ROUND,
        num_threads: int = 12,
        project_name: str = "Optimization",
        verbose: int = 1,
        enable_context: bool = True,
        **model_kwargs,
    ):
        """
        Args:
            model: The model to use for evaluation
            reasoning_model: The model to use for reasoning and prompt generation
            rounds: Number of optimization rounds
            num_prompts_per_round: Number of prompts to generate per round
            num_threads: Number of threads for parallel evaluation
            project_name: Optional project name for tracking
            verbose: Controls internal logging/progress bars (0=off, 1=on).
            enable_context: Whether to include task-specific context (metrics, examples) in the reasoning prompt.
            **model_kwargs: Additional model parameters
        """
        super().__init__(model=model, project_name=project_name, **model_kwargs)
        self.reasoning_model = reasoning_model if reasoning_model is not None else model
        self.rounds = rounds
        self.num_prompts_per_round = num_prompts_per_round
        self.num_threads = num_threads
        self.verbose = verbose
        self.dataset = None
        self._opik_client = opik_client.get_client_cached()
        self.llm_call_counter = 0
        self.enable_context = enable_context
        logger.debug(
            f"Initialized MetaPromptOptimizer with model={model}, reasoning_model={self.reasoning_model}"
        )
        logger.debug(
            f"Optimization rounds: {rounds}, Prompts/round: {num_prompts_per_round}"
        )

    @_throttle.rate_limited(_rate_limiter)
    def _call_model(
        self,
        messages: List[Dict[str, str]],
        is_reasoning: bool = False,
        optimization_id: Optional[str] = None,
    ) -> str:
        """Call the model with the given prompt and return the response."""
        self.llm_call_counter += 1
        # Note: Basic retry logic could be added here using tenacity
        try:
            # Basic LLM parameters (e.g., temperature, max_tokens)
            base_temperature = getattr(self, "temperature", 0.3)
            base_max_tokens = getattr(self, "max_tokens", 1000)

            # Use potentially different settings for reasoning calls
            reasoning_temperature = base_temperature  # Keep same temp unless specified otherwise
            # Increase max_tokens for reasoning to ensure JSON fits, unless already high
            reasoning_max_tokens = max(base_max_tokens, 3000) if is_reasoning else base_max_tokens

            llm_config_params = {
                "temperature": reasoning_temperature if is_reasoning else base_temperature,
                "max_tokens": reasoning_max_tokens,
                "top_p": getattr(self, "top_p", 1.0),
                "frequency_penalty": getattr(self, "frequency_penalty", 0.0),
                "presence_penalty": getattr(self, "presence_penalty", 0.0),
            }

            # Prepare metadata that we want to be part of the LLM call context.
            metadata_for_opik = {}
            if self.project_name:
                metadata_for_opik["project_name"] = (
                    self.project_name
                )  # Top-level for general use
                metadata_for_opik["opik"] = {"project_name": self.project_name}

            if optimization_id:
                # Also add to opik-specific structure if project_name was added
                if "opik" in metadata_for_opik:
                    metadata_for_opik["opik"]["optimization_id"] = optimization_id

            metadata_for_opik["optimizer_name"] = self.__class__.__name__
            metadata_for_opik["opik_call_type"] = (
                "reasoning" if is_reasoning else "evaluation_llm_task_direct"
            )

            if metadata_for_opik:
                llm_config_params["metadata"] = metadata_for_opik

            model_to_use = self.reasoning_model if is_reasoning else self.model

            # Pass llm_config_params (which now includes our metadata) to the Opik monitor.
            # The monitor is expected to return a dictionary suitable for spreading into litellm.completion,
            # having handled our metadata and added any Opik-specific configurations.
            final_call_params = opik_litellm_monitor.try_add_opik_monitoring_to_params(
                llm_config_params.copy()
            )

            logger.debug(
                f"Calling model '{model_to_use}' with messages: {messages}, "
                f"final params for litellm (from monitor): {final_call_params}"
            )

            response = litellm.completion(
                model=model_to_use,
                messages=messages,
                num_retries=6,
                **final_call_params
            )
            return response.choices[0].message.content
        except litellm.exceptions.RateLimitError as e:
            logger.error(f"LiteLLM Rate Limit Error: {e}")
            raise
        except litellm.exceptions.APIConnectionError as e:
            logger.error(f"LiteLLM API Connection Error: {e}")
            raise
        except litellm.exceptions.ContextWindowExceededError as e:
            logger.error(f"LiteLLM Context Window Exceeded Error: {e}")
            # Log prompt length if possible? Needs access to prompt_for_llm here.
            raise
        except Exception as e:
            logger.error(
                f"Error calling model '{model_to_use}': {type(e).__name__} - {e}"
            )
            raise

    # type: ignore
    def evaluate_prompt(
        self,
        prompt: chat_prompt.ChatPrompt,
        dataset: opik.Dataset,
        metric: Callable,
        use_full_dataset: bool = True,
        experiment_config: Optional[Dict] = None,
        n_samples: Optional[int] = None,
        optimization_id: Optional[str] = None,
        verbose: int = 1,
    ) -> float:
        """
        Args:
            prompt: The prompt to evaluate
            dataset: Opik Dataset to evaluate the prompt on
            metric: Metric functions
            use_full_dataset: Whether to use the full dataset or a subset
            experiment_config: Optional configuration for the experiment, useful to log additional metadata
            n_samples: Optional number of items to test in the dataset
            optimization_id: Optional ID of the optimization
            verbose: Controls internal logging/progress bars (0=off, 1=on).
        Returns:
            float: The evaluation score
        """
        # Calculate subset size for trials
        if not use_full_dataset:
            total_items = len(dataset.get_items())
            if n_samples is not None:
                if n_samples > total_items:
                    logger.warning(
                        f"Requested n_samples ({n_samples}) is larger than dataset size ({total_items}). Using full dataset."
                    )
                    subset_size = None
                else:
                    subset_size = n_samples
                    logger.debug(f"Using specified n_samples: {subset_size} items")
            else:
                # Calculate 20% of total, but no more than 20 items and no more than total items
                subset_size = min(total_items, min(20, max(10, int(total_items * 0.2))))
                logger.debug(
                    f"Using automatic subset size calculation: {subset_size} items (20% of {total_items} total items)"
                )
        else:
            subset_size = None  # Use all items for final checks
            logger.debug("Using full dataset for evaluation")

        experiment_config = experiment_config or {}
        experiment_config = {
            **experiment_config,
            **{
                "optimizer": self.__class__.__name__,
                "metric": metric.__name__,
                "dataset": dataset.name,
                "configuration": {
                    "prompt": prompt.formatted_messages,
                    "n_samples": subset_size,
                    "use_full_dataset": use_full_dataset,
                },
            },
        }
        if optimization_id:
            experiment_config["optimization_id"] = optimization_id

        def llm_task(dataset_item: Dict[str, Any]) -> Dict[str, str]:
            # --- Step 1: Prepare the prompt for the LLM ---
            messages = [{
                "role": item["role"],
                "content": item["content"].format(**dataset_item)
            } for item in prompt.formatted_messages]

            # --- Step 2: Call the model ---
            try:
                logger.debug(f"Calling LLM with prompt length: {sum(len(msg['content']) for msg in messages)}")
                raw_model_output = self._call_model(
                    messages=messages,
                    is_reasoning=False,
                    optimization_id=optimization_id,
                )
                logger.debug(f"LLM raw response length: {len(raw_model_output)}")
                logger.debug(f"LLM raw output: {raw_model_output}")
            except Exception as e:
                logger.error(f"Error calling model with prompt: {e}")
                logger.error(f"Failed prompt: {messages}")
                logger.error(f"Prompt length: {sum(len(msg['content']) for msg in messages)}")
                raise

            # --- Step 3: Clean the model's output before metric evaluation ---
            cleaned_model_output = raw_model_output.strip()

            result = {
                mappers.EVALUATED_LLM_TASK_OUTPUT: cleaned_model_output,
            }
            return result

        # Use dataset's get_items with limit for sampling
        logger.debug(
            f"Starting evaluation with {subset_size if subset_size else 'all'} samples for metric: {metric.__name__}"
        )
        score = task_evaluator.evaluate(
            dataset=dataset,
            metric=metric,
            evaluated_task=llm_task,
            num_threads=self.num_threads,
            project_name=self.project_name,
            n_samples=subset_size,  # Use subset_size for trials, None for full dataset
            experiment_config=experiment_config,
            optimization_id=optimization_id,
            verbose=self.verbose,
        )
        logger.debug(f"Evaluation score: {score:.4f}")
        return score

    def optimize_prompt(  # type: ignore[override]
        self,
        prompt: chat_prompt.ChatPrompt,
        dataset: Dataset,
        metric: Callable,
        experiment_config: Optional[Dict] = None,
        n_samples: Optional[int] = None,
        auto_continue: bool = False,
        **kwargs,
    ) -> OptimizationResult:
        """
        Optimize a prompt using meta-reasoning.

        Args:
            prompt: The prompt to optimize
            dataset: The dataset to evaluate against
            metric: The metric to use for evaluation
            experiment_config: A dictionary to log with the experiments
            n_samples: The number of dataset items to use for evaluation
            auto_continue: If True, the algorithm may continue if goal not met
            **kwargs: Additional arguments for evaluation

        Returns:
            OptimizationResult: Structured result containing optimization details
        """
        reporting.display_header(self.__class__.__name__, verbose=self.verbose)

        total_items = len(dataset.get_items())
        if n_samples is not None and n_samples > total_items:
            logger.warning(
                f"Requested n_samples ({n_samples}) is larger than dataset size ({total_items}). Using full dataset."
            )
            n_samples = None

        reporting.display_configuration(
            messages=prompt.formatted_messages,
            optimizer_config={
                "optimizer": self.__class__.__name__,
                "n_samples": n_samples,
                "auto_continue": auto_continue
            },
            verbose=self.verbose
        )

        optimization = None
        try:
            optimization = self._opik_client.create_optimization(
                dataset_name=dataset.name,
                objective_name=metric.__name__,
                metadata={"optimizer": self.__class__.__name__},
            )
            logger.debug(f"Created optimization with ID: {optimization.id}")
        except Exception as e:
            logger.warning(
                f"Opik server does not support optimizations: {e}. Please upgrade opik."
            )
            optimization = None

        try:
            result = self._optimize_prompt(
                optimization_id=optimization.id if optimization is not None else None,
                prompt=prompt,
                dataset=dataset,
                metric=metric,
                experiment_config=experiment_config,
                n_samples=n_samples,
                auto_continue=auto_continue,
                **kwargs,
            )
            if optimization:
                self.update_optimization(optimization, status="completed")
                logger.debug("Optimization completed successfully")
            return result
        except Exception as e:
            logger.error(f"Optimization failed: {e}")
            if optimization:
                self.update_optimization(optimization, status="cancelled")
                logger.debug("Optimization marked as cancelled")
            raise e

    def _optimize_prompt(
        self,
        optimization_id: str,
        prompt: chat_prompt.ChatPrompt,
        dataset: Dataset,
        metric: Callable,
        experiment_config: Optional[Dict],
        n_samples: int,
        auto_continue: bool,
        **kwargs,
    ) -> OptimizationResult:
        self.auto_continue = auto_continue
        self.dataset = dataset
        self.prompt = prompt
        self.llm_call_counter = 0  # Reset counter for run

        current_prompt = prompt.formatted_messages
        experiment_config = experiment_config or {}
        experiment_config = {
            **experiment_config,
            **{
                "optimizer": self.__class__.__name__,
                "metric": metric.__name__,
                "dataset": self.dataset.name,
                "configuration": {
                    "prompt": current_prompt,
                    "rounds": self.rounds,
                    "num_prompts_per_round": self.num_prompts_per_round,
                },
            },
        }

        with reporting.display_evaluation(verbose=self.verbose) as baseline_reporter:
            initial_score = self.evaluate_prompt(
                prompt=prompt,
                optimization_id=optimization_id,
                dataset=dataset,
                metric=metric,
                n_samples=n_samples,
                experiment_config=experiment_config,
                use_full_dataset=n_samples is None,
                verbose=self.verbose,
            )
            best_score = initial_score
            best_prompt = current_prompt
            rounds = []

            baseline_reporter.set_score(initial_score)

        reporting.display_optimization_start_message(verbose=self.verbose)
        with reporting.display_round_progress(self.rounds, verbose=self.verbose) as round_reporter:
            for round_num in range(self.rounds):

                round_reporter.round_start(round_num)
                previous_best_score = best_score

                # Step 1. Create a set of candidate prompts
                try:
                    candidate_prompts = self._generate_candidate_prompts(
                        current_prompt=best_prompt,
                        best_score=best_score,
                        round_num=round_num,
                        previous_rounds=rounds,
                        metric=metric,
                        optimization_id=optimization_id,
                    )
                except Exception as e:
                    round_reporter.failed_to_generate(self.num_prompts_per_round, e)
                    continue

                # Step 2. Score each candidate prompt
                prompt_scores = []
                for candidate_count, prompt in enumerate(candidate_prompts):
                    with reporting.display_prompt_candidate_scoring_report(candidate_count, prompt, verbose=self.verbose) as eval_report:
                        eval_report.set_generated_prompts(candidate_count, prompt)

                        try:
                            prompt_score = self.evaluate_prompt(
                                prompt=chat_prompt.ChatPrompt(messages=prompt),
                                optimization_id=optimization_id,
                                dataset=dataset,
                                metric=metric,
                                n_samples=n_samples,
                                use_full_dataset=False,
                                experiment_config=experiment_config,
                                verbose=self.verbose,
                            )

                            eval_report.set_final_score(best_score, prompt_score)
                        except Exception as e:
                            raise ValueError(f"Error evaluating candidate prompt: {e}")

                        prompt_scores.append((prompt, prompt_score))

                # Step 3. Identify potential improvements
                if not prompt_scores:
                    logger.warning("No prompts were successfully evaluated in this round")
                    break

                prompt_scores.sort(key=lambda x: x[1], reverse=True)
                best_candidate_this_round, best_cand_score_avg = (
                    prompt_scores[0]
                )
                improvement = self._calculate_improvement(best_cand_score_avg, best_score)
                round_reporter.round_end(round_num, best_cand_score_avg, best_score, best_prompt)

                round_data = self._create_round_data(
                    round_num=round_num,
                    current_best_prompt=chat_prompt.ChatPrompt(messages=best_candidate_this_round),
                    current_best_score=best_cand_score_avg,
                    best_prompt_overall=chat_prompt.ChatPrompt(messages=best_prompt),
                    evaluated_candidates=prompt_scores,
                    previous_best_score=previous_best_score,
                    improvement_this_round=improvement,
                )
                rounds.append(round_data)
                self._add_to_history(round_data.model_dump())

                if improvement > 0:
                    best_score = best_cand_score_avg
                    best_prompt = best_candidate_this_round

        reporting.display_result(
            initial_score,
            best_score,
            best_prompt,
            verbose=self.verbose
        )

        return self._create_result(
            metric,
            prompt,
            best_prompt,
            best_score,
            initial_score,
            rounds,
        )

    def _calculate_improvement(
        self, current_score: float, previous_score: float
    ) -> float:
        """Calculate the improvement percentage between scores."""
        return (
            (current_score - previous_score) / previous_score
            if previous_score > 0
            else 0
        )

    def _create_round_data(
        self,
        round_num: int,
        current_best_prompt: chat_prompt.ChatPrompt,
        current_best_score: float,
        best_prompt_overall: chat_prompt.ChatPrompt,
        evaluated_candidates: List[tuple[str, float, List[float]]],
        previous_best_score: float,
        improvement_this_round: float,
    ) -> OptimizationRound:
        """Create an OptimizationRound object with the current round's data."""
        generated_prompts_log = []
        for prompt, score in evaluated_candidates:
            improvement_vs_prev = self._calculate_improvement(
                score, previous_best_score
            )
            generated_prompts_log.append(
                {
                    "prompt": prompt,
                    "score": score,
                    "improvement": improvement_vs_prev,
                }
            )

        return OptimizationRound(
            round_number=round_num + 1,
            current_prompt=current_best_prompt,
            current_score=current_best_score,
            generated_prompts=generated_prompts_log,
            best_prompt=best_prompt_overall,
            best_score=current_best_score,
            improvement=improvement_this_round,
        )

    def _create_result(
        self,
        metric: Callable,
        prompt: chat_prompt.ChatPrompt,
        best_prompt: str,
        best_score: float,
        initial_score: float,
        rounds: List[OptimizationRound],
    ) -> OptimizationResult:
        """Create the final OptimizationResult object."""
        details = {
            "initial_prompt": prompt,
            "initial_score": initial_score,
            "final_prompt": best_prompt,
            "final_score": best_score,
            "rounds": rounds,
            "total_rounds": len(rounds),
            "metric_name": metric.__name__,
            "model": self.model,
            "temperature": self.model_kwargs.get("temperature"),
        }

        return OptimizationResult(
            optimizer=self.__class__.__name__,
            prompt=best_prompt,
            score=best_score,
            metric_name=metric.__name__,
            details=details,
            llm_calls=self.llm_call_counter
        )

    def _get_task_context(self, metric: Callable) -> str:
        """Get task-specific context from the dataset and metric configuration."""
        if self.dataset is None:
            return ""

        try:
            # Try get_items() first as it's the preferred method
            items = self.dataset.get_items()
            sample = items[0]  # Get first sample
        except Exception as e:
            logger.warning(f"Could not get sample from dataset: {e}")

        # Describe Single Metric
        if sample is not None:
            metric_name = metric.__name__
            description = metric.__doc__ or "No description available."

            metrics_str = f"- {metric_name}: {description}"

        context = "\nTask Context:\n"
        context += f"Dataset fields (includes both input and optionally the expected output): {', '.join([x for x in sample.keys() if x != 'id'])}\n"
        context += f"Evaluation Metric:\n{metrics_str}\n"
        context += f"\nExample:\n{json.dumps(sample)}\n"

        return context

    def _generate_candidate_prompts(
        self,
        current_prompt: chat_prompt.ChatPrompt,
        best_score: float,
        round_num: int,
        previous_rounds: List[OptimizationRound],
        metric: Callable,
        optimization_id: Optional[str] = None,
    ) -> List[str]:
        """Generate candidate prompts using meta-prompting."""
        with reporting.display_candidate_generation_report(
            self.num_prompts_per_round,
            verbose=self.verbose
        ) as candidate_generation_report:
            logger.debug(f"\nGenerating candidate prompts for round {round_num + 1}")
            logger.debug(f"Generating from prompt: {current_prompt}")
            logger.debug(f"Current best score: {best_score:.4f}")

            history_context = self._build_history_context(previous_rounds)
            task_context_str = ""
            analysis_instruction = ""
            metric_focus_instruction = ""
            improvement_point_1 = ""

            if self.enable_context:
                task_context_str = self._get_task_context(metric=metric)
                analysis_instruction = "Analyze the example provided (if any), the metric description (if any), and the history of scores."
                metric_focus_instruction = f"Focus on improving the score for the metric: {metric.__name__}."
                improvement_point_1 = "1. Be more specific and clear about expectations based on the metric and task."
                logger.debug("Task context and metric-specific instructions enabled for reasoning prompt.")
            else:
                analysis_instruction = "Analyze the history of scores and the current prompt\'s performance."
                metric_focus_instruction = "Focus on generating diverse and effective prompt variations based on the history."
                improvement_point_1 = "1. Be more specific and clear about expectations based on the task."
                logger.debug("Task context and metric-specific instructions disabled for reasoning prompt.")

            user_prompt = f"""Current prompt: {current_prompt}
Current score: {best_score}
{history_context}
{task_context_str}

{analysis_instruction}
Generate {self.num_prompts_per_round} improved versions of this prompt.
{metric_focus_instruction}
Each version should aim to:
{improvement_point_1}
2. Provide necessary context and constraints (if applicable, without relying on disabled external context).
3. Guide the model to produce the desired output format suitable for the task.
4. Remove ambiguity and unnecessary elements.
5. Maintain conciseness while being complete.

Return a valid JSON array as specified."""

            try:
                # Use _call_model which handles selecting reasoning_model
                content = self._call_model(
                    messages=[
                        {"role": "system", "content": self._REASONING_SYSTEM_PROMPT},
                        {"role": "user", "content": user_prompt}
                    ],
                    is_reasoning=True,
                    optimization_id=optimization_id,
                )
                logger.debug(f"Raw response from reasoning model: {content}")

                # --- Robust JSON Parsing and Validation ---
                json_result = None
                try:
                    # Try direct JSON parsing
                    json_result = json.loads(content)
                except json.JSONDecodeError:
                    import re

                    json_match = re.search(r"\{.*\}", content, re.DOTALL)
                    if json_match:
                        try:
                            json_result = json.loads(json_match.group())
                        except json.JSONDecodeError as e:
                            raise ValueError(f"Could not parse JSON extracted via regex: {e} - received: {json_match.group()}")
                    else:
                        raise ValueError(f"No JSON object found in response via regex. - received: {content}")

                # Validate the parsed JSON structure
                if isinstance(json_result, list) and len(json_result) == 1:
                    json_result = json_result[0]

                if not isinstance(json_result, dict) or "prompts" not in json_result:
                    logger.debug(f"Parsed JSON content: {json_result}")
                    raise ValueError(f"Parsed JSON is not a dictionary or missing 'prompts' key. - received: {json_result}")

                if not isinstance(json_result["prompts"], list):
                    logger.debug(f"Content of 'prompts': {json_result.get('prompts')}")
                    raise ValueError(f"'prompts' key does not contain a list. - received: {json_result.get('prompts')}")

                # Extract and log valid prompts
                valid_prompts = []
                for item in json_result["prompts"]:
                    if (
                        isinstance(item, dict)
                        and "prompt" in item
                        and isinstance(item["prompt"], list)
                    ):
                        prompt_text = item["prompt"]
                        valid_prompts.append(prompt_text)

                        # Log details
                        focus = item.get("improvement_focus", "N/A")
                        reasoning = item.get("reasoning", "N/A")
                        logger.debug(f"Generated prompt: {prompt_text}")
                        logger.debug(f" Improvement focus: {focus}")
                        logger.debug(f" Reasoning: {reasoning}")
                    else:
                        logger.warning(
                            f"Skipping invalid prompt item structure in JSON response: {item}"
                        )

                if not valid_prompts:
                    raise ValueError("No valid prompts found in the parsed JSON response after validation.")

                candidate_generation_report.set_generated_prompts(
                    self.num_prompts_per_round
                )

                return valid_prompts
                # --- End Robust Parsing ---

            except Exception as e:
                raise ValueError(f"Unexpected error during candidate prompt generation: {e}")

    def _build_history_context(self, previous_rounds: List[OptimizationRound]) -> str:
        """Build context from previous optimization rounds."""
        if not previous_rounds:
            return ""

        context = "\nPrevious rounds (latest first):\n"
        for round_data in reversed(previous_rounds[-3:]):
            context += f"\nRound {round_data.round_number}:\n"
            context += f"Best score this round: {round_data.best_score:.4f}\n"
            context += "Generated prompts this round (best first):\n"

            sorted_generated = sorted(
                round_data.generated_prompts,
                key=lambda p: p.get("score", -float("inf")),
                reverse=True,
            )

            for p in sorted_generated[:3]:
                prompt_text = p.get("prompt", "N/A")
                score = p.get("score", float("nan"))
                context += f"- Prompt: {prompt_text[:150]}...\n"
                context += f" Avg Score: {score:.4f}\n"
        return context

    def _get_evaluation_subset(
        self, dataset: opik.Dataset, min_size: int = 20, max_size: int = 100
    ) -> List[Dict[str, Any]]:
        """Get a random subset of the dataset for evaluation.

        Returns:
            List[Dict[str, Any]]: A list of dataset items to evaluate against
        """
        try:
            # Get all items from the dataset
            all_items = dataset.get_items()
            if not all_items:
                return all_items

            # Calculate subset size
            total_size = len(all_items)
            subset_size = min(max(min_size, int(total_size * 0.2)), max_size)

            # Get random subset of items
            import random

            return random.sample(all_items, subset_size)

        except Exception as e:
            logger.warning(f"Could not create evaluation subset: {e}")
            return all_items
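
For orientation, a minimal usage sketch of the MetaPromptOptimizer API added above. This is inferred from the signatures in this diff and is not code shipped in the package; the dataset name, the metric function and its exact signature, the model identifiers, and the import paths are assumptions.

    from typing import Any, Dict

    import opik
    from opik_optimizer.meta_prompt_optimizer import MetaPromptOptimizer
    from opik_optimizer.optimization_config import chat_prompt

    # Hypothetical dataset; "qa-demo" is a placeholder name, and the dataset is
    # expected to contain the fields referenced by the prompt (here: "question").
    dataset = opik.Opik().get_dataset(name="qa-demo")

    def exact_match(dataset_item: Dict[str, Any], llm_output: str) -> float:
        """1.0 if the model output equals the expected answer, else 0.0.

        The exact metric signature expected by task_evaluator is not shown in
        this diff; this form is an assumption for illustration.
        """
        return float(llm_output.strip() == dataset_item["answer"].strip())

    # Prompt messages use the same {"role", "content"} shape the optimizer
    # evaluates; placeholders are filled from dataset item fields via .format().
    prompt = chat_prompt.ChatPrompt(
        messages=[
            {"role": "system", "content": "You are a concise QA assistant."},
            {"role": "user", "content": "Answer the question: {question}"},
        ]
    )

    optimizer = MetaPromptOptimizer(
        model="openai/gpt-4o-mini",       # model used to run evaluations
        reasoning_model="openai/gpt-4o",  # model used to rewrite the prompt
        rounds=3,
        num_prompts_per_round=4,
    )

    result = optimizer.optimize_prompt(
        prompt=prompt,
        dataset=dataset,
        metric=exact_match,
        n_samples=50,
    )
    print(result.score, result.prompt)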