opik-optimizer 0.9.1__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to their public registry. It is provided for informational purposes only and reflects the changes between those versions exactly as they appear in the registry.
- opik_optimizer/__init__.py +7 -3
- opik_optimizer/_throttle.py +8 -8
- opik_optimizer/base_optimizer.py +98 -45
- opik_optimizer/cache_config.py +5 -3
- opik_optimizer/datasets/ai2_arc.py +15 -13
- opik_optimizer/datasets/cnn_dailymail.py +19 -15
- opik_optimizer/datasets/election_questions.py +10 -11
- opik_optimizer/datasets/gsm8k.py +16 -11
- opik_optimizer/datasets/halu_eval.py +6 -5
- opik_optimizer/datasets/hotpot_qa.py +17 -16
- opik_optimizer/datasets/medhallu.py +10 -7
- opik_optimizer/datasets/rag_hallucinations.py +11 -8
- opik_optimizer/datasets/ragbench.py +17 -9
- opik_optimizer/datasets/tiny_test.py +33 -37
- opik_optimizer/datasets/truthful_qa.py +18 -12
- opik_optimizer/demo/cache.py +6 -6
- opik_optimizer/demo/datasets.py +3 -7
- opik_optimizer/evolutionary_optimizer/__init__.py +3 -1
- opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +748 -437
- opik_optimizer/evolutionary_optimizer/reporting.py +155 -76
- opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +291 -181
- opik_optimizer/few_shot_bayesian_optimizer/reporting.py +79 -28
- opik_optimizer/logging_config.py +19 -15
- opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +234 -138
- opik_optimizer/meta_prompt_optimizer/reporting.py +121 -47
- opik_optimizer/mipro_optimizer/__init__.py +2 -0
- opik_optimizer/mipro_optimizer/_lm.py +41 -9
- opik_optimizer/mipro_optimizer/_mipro_optimizer_v2.py +37 -26
- opik_optimizer/mipro_optimizer/mipro_optimizer.py +135 -67
- opik_optimizer/mipro_optimizer/utils.py +5 -2
- opik_optimizer/optimizable_agent.py +179 -0
- opik_optimizer/optimization_config/chat_prompt.py +143 -73
- opik_optimizer/optimization_config/configs.py +4 -3
- opik_optimizer/optimization_config/mappers.py +18 -6
- opik_optimizer/optimization_result.py +28 -20
- opik_optimizer/py.typed +0 -0
- opik_optimizer/reporting_utils.py +96 -46
- opik_optimizer/task_evaluator.py +12 -14
- opik_optimizer/utils.py +122 -37
- {opik_optimizer-0.9.1.dist-info → opik_optimizer-1.0.0.dist-info}/METADATA +8 -8
- opik_optimizer-1.0.0.dist-info/RECORD +50 -0
- opik_optimizer-0.9.1.dist-info/RECORD +0 -48
- {opik_optimizer-0.9.1.dist-info → opik_optimizer-1.0.0.dist-info}/WHEEL +0 -0
- {opik_optimizer-0.9.1.dist-info → opik_optimizer-1.0.0.dist-info}/licenses/LICENSE +0 -0
- {opik_optimizer-0.9.1.dist-info → opik_optimizer-1.0.0.dist-info}/top_level.txt +0 -0
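The largest changes in this release are in the optimizers themselves; the `meta_prompt_optimizer.py` diff below, for example, drops `project_name` from the constructor, renames `evaluate_prompt` to the private `_evaluate_prompt`, and routes model calls through an `agent_class` hook based on the new `OptimizableAgent`. As orientation before reading the diff, here is a minimal sketch of how the 1.0.0 `MetaPromptOptimizer` appears to be called; the import paths, dataset contents, and metric are illustrative assumptions, not documented API.

```python
# Sketch inferred from the 1.0.0 diff below; imports, dataset and metric are
# assumptions for illustration only.
import opik
from opik_optimizer.meta_prompt_optimizer import MetaPromptOptimizer  # assumed path
from opik_optimizer.optimization_config.chat_prompt import ChatPrompt  # assumed path


def exact_match(dataset_item, llm_output) -> float:
    # Signature implied by the new validation in optimize_prompt():
    # "Metric must be a function that takes `dataset_item` and `llm_output`".
    return float(llm_output.strip() == dataset_item["answer"].strip())


client = opik.Opik()
dataset = client.get_or_create_dataset(name="demo-questions")  # opik SDK call (assumed setup)
dataset.insert([{"question": "What is 2 + 2?", "answer": "4"}])

prompt = ChatPrompt(
    system="Answer the question concisely.",
    user="{question}",  # dataset fields appear to be substituted into the messages
)

optimizer = MetaPromptOptimizer(
    model="openai/gpt-4o-mini",  # project_name is no longer a constructor argument
    verbose=1,
)
result = optimizer.optimize_prompt(
    prompt=prompt,
    dataset=dataset,
    metric=exact_match,
    n_samples=10,
    # agent_class=MyAgent,  # optional Type[OptimizableAgent], new in 1.0.0
)
```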
opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py

@@ -1,7 +1,7 @@
 import json
 import logging
 import os
-from typing import Any, Callable, Dict, List, Optional,
+from typing import Any, Callable, Dict, List, Optional, Tuple, Type

 import litellm
 import opik
@@ -13,11 +13,13 @@ from opik.environment import get_tqdm_for_current_environment
 from opik.evaluation.models.litellm import opik_monitor as opik_litellm_monitor

 from opik_optimizer import task_evaluator
+from opik_optimizer import utils

 from .. import _throttle
 from ..base_optimizer import BaseOptimizer, OptimizationRound
 from ..optimization_config import chat_prompt, mappers
 from ..optimization_result import OptimizationResult
+from ..optimizable_agent import OptimizableAgent
 from . import reporting

 tqdm = get_tqdm_for_current_environment()
@@ -35,17 +37,18 @@ _rate_limiter = _throttle.get_rate_limiter_for_current_opik_installation()
 class MetaPromptOptimizer(BaseOptimizer):
     """
     The Meta-Prompt Optimizer uses meta-prompting to improve prompts based on examples and performance.
-
+
     This algorithm is best used when you have a prompt and would like to make sure it follows best
     practices.
     """
+
     # --- Constants for Default Configuration ---
     DEFAULT_ROUNDS = 3
     DEFAULT_PROMPTS_PER_ROUND = 4

     # --- Reasoning System Prompt ---
     _REASONING_SYSTEM_PROMPT = """You are an expert prompt engineer. Your task is to improve prompts for any type of task.
-
+
 Focus on making the prompt more effective by:
 1. Being clear and specific about what is expected
 2. Providing necessary context and constraints
@@ -83,11 +86,10 @@ class MetaPromptOptimizer(BaseOptimizer):
         rounds: int = DEFAULT_ROUNDS,
         num_prompts_per_round: int = DEFAULT_PROMPTS_PER_ROUND,
         num_threads: int = 12,
-        project_name: str = "Optimization",
         verbose: int = 1,
         enable_context: bool = True,
-        **model_kwargs,
-    ):
+        **model_kwargs: Any,
+    ) -> None:
         """
         Args:
             model: The model to use for evaluation
@@ -95,18 +97,16 @@ class MetaPromptOptimizer(BaseOptimizer):
             rounds: Number of optimization rounds
             num_prompts_per_round: Number of prompts to generate per round
             num_threads: Number of threads for parallel evaluation
-            project_name: Optional project name for tracking
             verbose: Controls internal logging/progress bars (0=off, 1=on).
             enable_context: Whether to include task-specific context (metrics, examples) in the reasoning prompt.
             **model_kwargs: Additional model parameters
         """
-        super().__init__(model=model,
+        super().__init__(model=model, verbose=verbose, **model_kwargs)
         self.reasoning_model = reasoning_model if reasoning_model is not None else model
         self.rounds = rounds
         self.num_prompts_per_round = num_prompts_per_round
         self.num_threads = num_threads
-        self.
-        self.dataset = None
+        self.dataset: Optional[Dataset] = None
         self._opik_client = opik_client.get_client_cached()
         self.llm_call_counter = 0
         self.enable_context = enable_context
@@ -120,6 +120,7 @@ class MetaPromptOptimizer(BaseOptimizer):
     @_throttle.rate_limited(_rate_limiter)
     def _call_model(
         self,
+        project_name: str,
         messages: List[Dict[str, str]],
         is_reasoning: bool = False,
         optimization_id: Optional[str] = None,
@@ -133,12 +134,18 @@ class MetaPromptOptimizer(BaseOptimizer):
         base_max_tokens = getattr(self, "max_tokens", 1000)

         # Use potentially different settings for reasoning calls
-        reasoning_temperature =
+        reasoning_temperature = (
+            base_temperature  # Keep same temp unless specified otherwise
+        )
         # Increase max_tokens for reasoning to ensure JSON fits, unless already high
-        reasoning_max_tokens =
+        reasoning_max_tokens = (
+            max(base_max_tokens, 3000) if is_reasoning else base_max_tokens
+        )

         llm_config_params = {
-            "temperature":
+            "temperature": (
+                reasoning_temperature if is_reasoning else base_temperature
+            ),
             "max_tokens": reasoning_max_tokens,
             "top_p": getattr(self, "top_p", 1.0),
             "frequency_penalty": getattr(self, "frequency_penalty", 0.0),
@@ -146,12 +153,12 @@ class MetaPromptOptimizer(BaseOptimizer):
         }

         # Prepare metadata that we want to be part of the LLM call context.
-        metadata_for_opik = {}
-        if
+        metadata_for_opik: Dict[str, Any] = {}
+        if project_name:
             metadata_for_opik["project_name"] = (
-
-            )
-            metadata_for_opik["opik"] = {"project_name":
+                project_name  # Top-level for general use
+            )
+            metadata_for_opik["opik"] = {"project_name": project_name}

         if optimization_id:
             # Also add to opik-specific structure if project_name was added
@@ -181,10 +188,10 @@ class MetaPromptOptimizer(BaseOptimizer):
             )

             response = litellm.completion(
-                model=model_to_use,
-                messages=messages,
+                model=model_to_use,
+                messages=messages,
                 num_retries=6,
-                **final_call_params
+                **final_call_params,
             )
             return response.choices[0].message.content
         except litellm.exceptions.RateLimitError as e:
@@ -197,27 +204,26 @@ class MetaPromptOptimizer(BaseOptimizer):
             logger.error(f"LiteLLM Context Window Exceeded Error: {e}")
             # Log prompt length if possible? Needs access to prompt_for_llm here.
             raise
-        except Exception
-            logger.error(
-
-            )
+        except Exception:
+            # logger.error(
+            #     f"Error calling model '{model_to_use}': {type(e).__name__} - {e}"
+            # )
             raise

-
-    def evaluate_prompt(
+    def _evaluate_prompt(
         self,
         prompt: chat_prompt.ChatPrompt,
         dataset: opik.Dataset,
         metric: Callable,
-        use_full_dataset: bool = True,
-        experiment_config: Optional[Dict] = None,
         n_samples: Optional[int] = None,
+        dataset_item_ids: Optional[List[str]] = None,
+        experiment_config: Optional[Dict] = None,
+        use_full_dataset: bool = True,
         optimization_id: Optional[str] = None,
-
+        **kwargs: Any,
     ) -> float:
         """
         Args:
-            prompt: The prompt to evaluate
             dataset: Opik Dataset to evaluate the prompt on
             metric: Metric functions
             use_full_dataset: Whether to use the full dataset or a subset
@@ -249,16 +255,18 @@ class MetaPromptOptimizer(BaseOptimizer):
         else:
             subset_size = None  # Use all items for final checks
             logger.debug("Using full dataset for evaluation")
-
+
         experiment_config = experiment_config or {}
         experiment_config = {
             **experiment_config,
             **{
                 "optimizer": self.__class__.__name__,
-                "
+                "agent_class": self.agent_class.__name__,
+                "agent_config": prompt.to_dict(),
+                "metric": getattr(metric, "__name__", str(metric)),
                 "dataset": dataset.name,
                 "configuration": {
-                    "prompt": prompt.
+                    "prompt": prompt.get_messages(),
                     "n_samples": subset_size,
                     "use_full_dataset": use_full_dataset,
                 },
@@ -269,25 +277,33 @@ class MetaPromptOptimizer(BaseOptimizer):

         def llm_task(dataset_item: Dict[str, Any]) -> Dict[str, str]:
             # --- Step 1: Prepare the prompt for the LLM ---
-            messages = [
-
-
-
+            # messages = [
+            #     {
+            #         "role": item["role"],
+            #         "content": item["content"].format(**dataset_item),
+            #     }
+            #     for item in prompt.get_messages()
+            # ]
+            # Step 1: create the agent
+            new_prompt = prompt.copy()
+            messages = new_prompt.get_messages(dataset_item)
+            new_prompt.set_messages(messages)
+            agent = self.agent_class(new_prompt)

             # --- Step 2: Call the model ---
             try:
-                logger.debug(
-
-                messages=messages,
-                is_reasoning=False,
-                optimization_id=optimization_id,
+                logger.debug(
+                    f"Calling LLM with prompt length: {sum(len(msg['content']) for msg in messages)}"
                 )
+                raw_model_output = agent.invoke(messages)
                 logger.debug(f"LLM raw response length: {len(raw_model_output)}")
                 logger.debug(f"LLM raw output: {raw_model_output}")
             except Exception as e:
                 logger.error(f"Error calling model with prompt: {e}")
                 logger.error(f"Failed prompt: {messages}")
-                logger.error(
+                logger.error(
+                    f"Prompt length: {sum(len(msg['content']) for msg in messages)}"
+                )
                 raise

             # --- Step 3: Clean the model's output before metric evaluation ---
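The hunk above replaces the inline model call in `llm_task` with an agent object: the optimizer now copies the prompt, applies the dataset item, instantiates `self.agent_class(new_prompt)`, calls `agent.invoke(messages)`, and reads a class-level `project_name` for tracking. A custom class passed via `optimize_prompt(..., agent_class=...)` therefore appears to need at least that surface. The sketch below only mirrors the calls visible in this diff; the real `OptimizableAgent` base class (added in `opik_optimizer/optimizable_agent.py`) may already provide these for LiteLLM-backed models.

```python
# Minimal sketch of the agent surface this hunk relies on; the constructor and
# invoke() of the real OptimizableAgent base class are not shown in the diff.
from typing import Dict, List

from opik_optimizer.optimizable_agent import OptimizableAgent


class EchoAgent(OptimizableAgent):
    project_name = "my-project"  # read as self.agent_class.project_name

    def __init__(self, prompt) -> None:
        # `prompt` is a chat_prompt.ChatPrompt copy with dataset fields applied.
        self.prompt = prompt

    def invoke(self, messages: List[Dict[str, str]]) -> str:
        # A real agent would call an LLM here; echoing the last message keeps
        # the sketch self-contained.
        return messages[-1]["content"]
```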
@@ -300,14 +316,15 @@ class MetaPromptOptimizer(BaseOptimizer):

         # Use dataset's get_items with limit for sampling
         logger.debug(
-            f"Starting evaluation with {subset_size if subset_size else 'all'} samples for metric: {metric
+            f"Starting evaluation with {subset_size if subset_size else 'all'} samples for metric: {getattr(metric, '__name__', str(metric))}"
         )
         score = task_evaluator.evaluate(
             dataset=dataset,
             metric=metric,
             evaluated_task=llm_task,
+            dataset_item_ids=dataset_item_ids,
             num_threads=self.num_threads,
-            project_name=self.project_name,
+            project_name=self.agent_class.project_name,
             n_samples=subset_size,  # Use subset_size for trials, None for full dataset
             experiment_config=experiment_config,
             optimization_id=optimization_id,
@@ -316,7 +333,7 @@ class MetaPromptOptimizer(BaseOptimizer):
         logger.debug(f"Evaluation score: {score:.4f}")
         return score

-    def optimize_prompt(
+    def optimize_prompt(
         self,
         prompt: chat_prompt.ChatPrompt,
         dataset: Dataset,
@@ -324,13 +341,13 @@ class MetaPromptOptimizer(BaseOptimizer):
         experiment_config: Optional[Dict] = None,
         n_samples: Optional[int] = None,
         auto_continue: bool = False,
-
+        agent_class: Optional[Type[OptimizableAgent]] = None,
+        **kwargs: Any,
     ) -> OptimizationResult:
         """
         Optimize a prompt using meta-reasoning.

         Args:
-            prompt: The prompt to optimize
             dataset: The dataset to evaluate against
             metric: The metric to use for evaluation
             experiment_config: A dictionary to log with the experiments
@@ -341,8 +358,27 @@ class MetaPromptOptimizer(BaseOptimizer):
         Returns:
             OptimizationResult: Structured result containing optimization details
         """
-
-
+        if not isinstance(prompt, chat_prompt.ChatPrompt):
+            raise ValueError("Prompt must be a ChatPrompt object")
+
+        if not isinstance(dataset, Dataset):
+            raise ValueError("Dataset must be a Dataset object")
+
+        if not callable(metric):
+            raise ValueError(
+                "Metric must be a function that takes `dataset_item` and `llm_output` as arguments."
+            )
+
+        if prompt.model is None:
+            prompt.model = self.model
+        if prompt.model_kwargs is None:
+            prompt.model_kwargs = self.model_kwargs
+
+        if agent_class is None:
+            self.agent_class = utils.create_litellm_agent_class(prompt)
+        else:
+            self.agent_class = agent_class
+
         total_items = len(dataset.get_items())
         if n_samples is not None and n_samples > total_items:
             logger.warning(
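The validation added above is one of the behavioral changes relative to 0.9.1: `optimize_prompt` now insists on a `ChatPrompt`, an Opik `Dataset`, and a plain callable metric, fills in `prompt.model`/`prompt.model_kwargs` from the optimizer, and falls back to `utils.create_litellm_agent_class(prompt)` when no `agent_class` is given. A conforming metric is simply a function of `dataset_item` and `llm_output` returning a score; for example, under the assumption that the dataset has an `answer` field:

```python
# Hedged example of a metric accepted by the new validation; the "answer"
# field name is an assumption about the dataset, not part of the API.
from typing import Any, Dict


def answer_recall(dataset_item: Dict[str, Any], llm_output: str) -> float:
    expected = dataset_item["answer"].lower()
    return 1.0 if expected in llm_output.lower() else 0.0
```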
@@ -350,21 +386,11 @@ class MetaPromptOptimizer(BaseOptimizer):
             )
             n_samples = None

-        reporting.display_configuration(
-            messages=prompt.formatted_messages,
-            optimizer_config={
-                "optimizer": self.__class__.__name__,
-                "n_samples": n_samples,
-                "auto_continue": auto_continue
-            },
-            verbose=self.verbose
-        )
-
         optimization = None
         try:
             optimization = self._opik_client.create_optimization(
                 dataset_name=dataset.name,
-                objective_name=metric
+                objective_name=getattr(metric, "__name__", str(metric)),
                 metadata={"optimizer": self.__class__.__name__},
             )
             logger.debug(f"Created optimization with ID: {optimization.id}")
@@ -374,9 +400,26 @@ class MetaPromptOptimizer(BaseOptimizer):
             )
             optimization = None

+        reporting.display_header(
+            algorithm=self.__class__.__name__,
+            optimization_id=optimization.id if optimization is not None else None,
+            dataset_id=dataset.id,
+            verbose=self.verbose,
+        )
+        reporting.display_configuration(
+            messages=prompt.get_messages(),
+            optimizer_config={
+                "optimizer": self.__class__.__name__,
+                "n_samples": n_samples,
+                "auto_continue": auto_continue,
+            },
+            verbose=self.verbose,
+        )
+
         try:
+            optimization_id = optimization.id if optimization is not None else None
             result = self._optimize_prompt(
-                optimization_id=
+                optimization_id=optimization_id,
                 prompt=prompt,
                 dataset=dataset,
                 metric=metric,
@@ -398,30 +441,33 @@ class MetaPromptOptimizer(BaseOptimizer):

     def _optimize_prompt(
         self,
-        optimization_id: str,
+        optimization_id: Optional[str],
         prompt: chat_prompt.ChatPrompt,
         dataset: Dataset,
         metric: Callable,
         experiment_config: Optional[Dict],
-        n_samples: int,
+        n_samples: Optional[int],
         auto_continue: bool,
-        **kwargs,
+        **kwargs: Any,
     ) -> OptimizationResult:
         self.auto_continue = auto_continue
         self.dataset = dataset
         self.prompt = prompt
-        self.llm_call_counter = 0
+        self.llm_call_counter = 0  # Reset counter for run
+        initial_prompt = prompt

-        current_prompt = prompt
+        current_prompt = prompt
         experiment_config = experiment_config or {}
         experiment_config = {
             **experiment_config,
             **{
                 "optimizer": self.__class__.__name__,
-                "
-                "
+                "agent_class": self.agent_class.__name__,
+                "agent_config": prompt.to_dict(),
+                "metric": getattr(metric, "__name__", str(metric)),
+                "dataset": dataset.name,
                 "configuration": {
-                    "prompt":
+                    "prompt": prompt.get_messages(),
                     "rounds": self.rounds,
                     "num_prompts_per_round": self.num_prompts_per_round,
                 },
@@ -429,8 +475,8 @@ class MetaPromptOptimizer(BaseOptimizer):
         }

         with reporting.display_evaluation(verbose=self.verbose) as baseline_reporter:
-            initial_score = self.
-                prompt
+            initial_score = self._evaluate_prompt(
+                prompt,
                 optimization_id=optimization_id,
                 dataset=dataset,
                 metric=metric,
@@ -441,20 +487,22 @@ class MetaPromptOptimizer(BaseOptimizer):
             )
             best_score = initial_score
             best_prompt = current_prompt
-            rounds = []
+            rounds: List[OptimizationRound] = []

             baseline_reporter.set_score(initial_score)

         reporting.display_optimization_start_message(verbose=self.verbose)
-        with reporting.display_round_progress(
+        with reporting.display_round_progress(
+            self.rounds, verbose=self.verbose
+        ) as round_reporter:
             for round_num in range(self.rounds):
-
                 round_reporter.round_start(round_num)
                 previous_best_score = best_score
-
+
                 # Step 1. Create a set of candidate prompts
                 try:
                     candidate_prompts = self._generate_candidate_prompts(
+                        project_name=self.agent_class.project_name,
                         current_prompt=best_prompt,
                         best_score=best_score,
                         round_num=round_num,
@@ -467,14 +515,19 @@ class MetaPromptOptimizer(BaseOptimizer):
                     continue

                 # Step 2. Score each candidate prompt
-                prompt_scores = []
+                prompt_scores: List[Tuple[chat_prompt.ChatPrompt, float]] = []
                 for candidate_count, prompt in enumerate(candidate_prompts):
-                    with reporting.display_prompt_candidate_scoring_report(
+                    with reporting.display_prompt_candidate_scoring_report(
+                        verbose=self.verbose
+                    ) as eval_report:
                         eval_report.set_generated_prompts(candidate_count, prompt)

+                        new_prompt = current_prompt.copy()
+                        new_prompt.set_messages(prompt.get_messages())
+
                         try:
-                            prompt_score = self.
-                                prompt=
+                            prompt_score = self._evaluate_prompt(
+                                prompt=new_prompt,
                                 optimization_id=optimization_id,
                                 dataset=dataset,
                                 metric=metric,
@@ -485,34 +538,37 @@ class MetaPromptOptimizer(BaseOptimizer):
                             )

                             eval_report.set_final_score(best_score, prompt_score)
-                        except Exception
-
+                        except Exception:
+                            print("Failed evaluating agent; continuing...")
+                            prompt_score = 0

                     prompt_scores.append((prompt, prompt_score))
-
+
                 # Step 3. Identify potential improvements
                 if not prompt_scores:
-                    logger.warning(
+                    logger.warning(
+                        "No prompts were successfully evaluated in this round"
+                    )
                     break

                 prompt_scores.sort(key=lambda x: x[1], reverse=True)
-                best_candidate_this_round, best_cand_score_avg =
-
+                best_candidate_this_round, best_cand_score_avg = prompt_scores[0]
+                improvement = self._calculate_improvement(
+                    best_cand_score_avg, best_score
                 )
-
-
-
+                round_reporter.round_end(round_num, best_cand_score_avg, best_score)
+
                 round_data = self._create_round_data(
                     round_num=round_num,
-                    current_best_prompt=
-                    current_best_score=
-                    best_prompt_overall=
+                    current_best_prompt=best_prompt,
+                    current_best_score=best_score,
+                    best_prompt_overall=best_prompt,
                     evaluated_candidates=prompt_scores,
                     previous_best_score=previous_best_score,
                     improvement_this_round=improvement,
                 )
                 rounds.append(round_data)
-                self._add_to_history(round_data
+                self._add_to_history(round_data)

                 if improvement > 0:
                     best_score = best_cand_score_avg
@@ -521,17 +577,21 @@ class MetaPromptOptimizer(BaseOptimizer):
         reporting.display_result(
             initial_score,
             best_score,
-            best_prompt,
-            verbose=self.verbose
+            best_prompt.get_messages() if best_prompt is not None else [],
+            verbose=self.verbose,
         )

         return self._create_result(
             metric,
-
-
-
-
-
+            initial_prompt=initial_prompt.get_messages()
+            if initial_prompt is not None
+            else [],
+            best_prompt=best_prompt.get_messages() if best_prompt is not None else [],
+            best_score=best_score,
+            initial_score=initial_score,
+            rounds=rounds,
+            dataset_id=dataset.id,
+            optimization_id=optimization_id,
         )

     def _calculate_improvement(
@@ -550,7 +610,7 @@ class MetaPromptOptimizer(BaseOptimizer):
         current_best_prompt: chat_prompt.ChatPrompt,
         current_best_score: float,
         best_prompt_overall: chat_prompt.ChatPrompt,
-        evaluated_candidates: List[
+        evaluated_candidates: List[Tuple[chat_prompt.ChatPrompt, float]],
         previous_best_score: float,
         improvement_this_round: float,
     ) -> OptimizationRound:
@@ -562,7 +622,7 @@ class MetaPromptOptimizer(BaseOptimizer):
             )
             generated_prompts_log.append(
                 {
-                    "prompt": prompt,
+                    "prompt": prompt.get_messages(),
                     "score": score,
                     "improvement": improvement_vs_prev,
                 }
@@ -581,21 +641,21 @@ class MetaPromptOptimizer(BaseOptimizer):
     def _create_result(
         self,
         metric: Callable,
-
-        best_prompt: str,
+        initial_prompt: List[Dict[str, str]],
+        best_prompt: List[Dict[str, str]],
         best_score: float,
         initial_score: float,
         rounds: List[OptimizationRound],
+        dataset_id: Optional[str],
+        optimization_id: Optional[str],
     ) -> OptimizationResult:
         """Create the final OptimizationResult object."""
         details = {
-            "initial_prompt": prompt,
-            "initial_score": initial_score,
             "final_prompt": best_prompt,
             "final_score": best_score,
             "rounds": rounds,
             "total_rounds": len(rounds),
-            "metric_name": metric
+            "metric_name": getattr(metric, "__name__", str(metric)),
             "model": self.model,
             "temperature": self.model_kwargs.get("temperature"),
         }
@@ -604,9 +664,13 @@ class MetaPromptOptimizer(BaseOptimizer):
             optimizer=self.__class__.__name__,
             prompt=best_prompt,
             score=best_score,
-
+            initial_prompt=initial_prompt,
+            initial_score=initial_score,
+            metric_name=getattr(metric, "__name__", str(metric)),
             details=details,
-            llm_calls=self.llm_call_counter
+            llm_calls=self.llm_call_counter,
+            dataset_id=dataset_id,
+            optimization_id=optimization_id,
         )

     def _get_task_context(self, metric: Callable) -> str:
@@ -632,7 +696,7 @@ class MetaPromptOptimizer(BaseOptimizer):
         context += f"Dataset fields (includes both input and optionally the expected output): {', '.join([x for x in sample.keys() if x != 'id'])}\n"
         context += f"Evaluation Metric:\n{metrics_str}\n"
         context += f"\nExample:\n{json.dumps(sample)}\n"
-
+
         return context

     def _generate_candidate_prompts(
@@ -643,14 +707,14 @@ class MetaPromptOptimizer(BaseOptimizer):
         previous_rounds: List[OptimizationRound],
         metric: Callable,
         optimization_id: Optional[str] = None,
-
+        project_name: Optional[str] = None,
+    ) -> List[chat_prompt.ChatPrompt]:
         """Generate candidate prompts using meta-prompting."""
         with reporting.display_candidate_generation_report(
-            self.num_prompts_per_round,
-
-        ) as candidate_generation_report:
+            self.num_prompts_per_round, verbose=self.verbose
+        ) as candidate_generation_report:
             logger.debug(f"\nGenerating candidate prompts for round {round_num + 1}")
-            logger.debug(f"Generating from prompt: {current_prompt}")
+            logger.debug(f"Generating from prompt: {current_prompt.get_messages()}")
             logger.debug(f"Current best score: {best_score:.4f}")

             history_context = self._build_history_context(previous_rounds)
@@ -662,16 +726,22 @@ class MetaPromptOptimizer(BaseOptimizer):
             if self.enable_context:
                 task_context_str = self._get_task_context(metric=metric)
                 analysis_instruction = "Analyze the example provided (if any), the metric description (if any), and the history of scores."
-                metric_focus_instruction =
+                metric_focus_instruction = (
+                    f"Focus on improving the score for the metric: {metric.__name__}."
+                )
                 improvement_point_1 = "1. Be more specific and clear about expectations based on the metric and task."
-                logger.debug(
+                logger.debug(
+                    "Task context and metric-specific instructions enabled for reasoning prompt."
+                )
             else:
-                analysis_instruction = "Analyze the history of scores and the current prompt
+                analysis_instruction = "Analyze the history of scores and the current prompt's performance."
                 metric_focus_instruction = "Focus on generating diverse and effective prompt variations based on the history."
                 improvement_point_1 = "1. Be more specific and clear about expectations based on the task."
-                logger.debug(
+                logger.debug(
+                    "Task context and metric-specific instructions disabled for reasoning prompt."
+                )

-            user_prompt = f"""Current prompt: {current_prompt}
+            user_prompt = f"""Current prompt: {current_prompt.get_messages()}
 Current score: {best_score}
 {history_context}
 {task_context_str}
@@ -691,9 +761,10 @@ class MetaPromptOptimizer(BaseOptimizer):
             try:
                 # Use _call_model which handles selecting reasoning_model
                 content = self._call_model(
+                    project_name,
                     messages=[
                         {"role": "system", "content": self._REASONING_SYSTEM_PROMPT},
-                        {"role": "user", "content": user_prompt}
+                        {"role": "user", "content": user_prompt},
                    ],
                     is_reasoning=True,
                     optimization_id=optimization_id,
@@ -713,9 +784,13 @@ class MetaPromptOptimizer(BaseOptimizer):
                 try:
                     json_result = json.loads(json_match.group())
                 except json.JSONDecodeError as e:
-                    raise ValueError(
+                    raise ValueError(
+                        f"Could not parse JSON extracted via regex: {e} - received: {json_match.group()}"
+                    )
             else:
-                raise ValueError(
+                raise ValueError(
+                    f"No JSON object found in response via regex. - received: {content}"
+                )

             # Validate the parsed JSON structure
             if isinstance(json_result, list) and len(json_result) == 1:
@@ -723,27 +798,46 @@ class MetaPromptOptimizer(BaseOptimizer):

             if not isinstance(json_result, dict) or "prompts" not in json_result:
                 logger.debug(f"Parsed JSON content: {json_result}")
-                raise ValueError(
+                raise ValueError(
+                    f"Parsed JSON is not a dictionary or missing 'prompts' key. - received: {json_result}"
+                )

             if not isinstance(json_result["prompts"], list):
                 logger.debug(f"Content of 'prompts': {json_result.get('prompts')}")
-                raise ValueError(
+                raise ValueError(
+                    f"'prompts' key does not contain a list. - received: {json_result.get('prompts')}"
+                )

             # Extract and log valid prompts
-            valid_prompts = []
+            valid_prompts: List[chat_prompt.ChatPrompt] = []
             for item in json_result["prompts"]:
                 if (
                     isinstance(item, dict)
                     and "prompt" in item
                     and isinstance(item["prompt"], list)
                 ):
-
-
-
+                    # NOTE: might be brittle
+                    if current_prompt.user:
+                        user_text = current_prompt.user
+                    else:
+                        if current_prompt.messages is not None:
+                            user_text = current_prompt.messages[-1]["content"]
+                        else:
+                            raise Exception(
+                                "User content not found in chat-prompt!"
+                            )
+
+                    valid_prompts.append(
+                        chat_prompt.ChatPrompt(
+                            system=item["prompt"][0]["content"],
+                            user=user_text,
+                        )
+                    )
+
                     # Log details
                     focus = item.get("improvement_focus", "N/A")
                     reasoning = item.get("reasoning", "N/A")
-                    logger.debug(f"Generated prompt: {
+                    logger.debug(f"Generated prompt: {item['prompt']}")
                     logger.debug(f" Improvement focus: {focus}")
                     logger.debug(f" Reasoning: {reasoning}")
                 else:
@@ -752,17 +846,19 @@ class MetaPromptOptimizer(BaseOptimizer):
                     )

             if not valid_prompts:
-                raise ValueError(
-
-
-
-                )
-
+                raise ValueError(
+                    "No valid prompts found in the parsed JSON response after validation."
+                )
+
+            candidate_generation_report.set_generated_prompts()
+
             return valid_prompts
             # --- End Robust Parsing ---

         except Exception as e:
-            raise ValueError(
+            raise ValueError(
+                f"Unexpected error during candidate prompt generation: {e}"
+            )

     def _build_history_context(self, previous_rounds: List[OptimizationRound]) -> str:
         """Build context from previous optimization rounds."""
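For reference when reading the parsing code in the last hunks above, the reasoning model is expected to return JSON with a top-level `prompts` list; each entry carries a `prompt` message list (whose first message becomes the new system prompt) plus optional `improvement_focus` and `reasoning` fields. A response shaped like the following Python literal would pass the validation in this diff; the wording of the values is purely illustrative.

```python
# Shape implied by the validation above ("prompts" list, "prompt" message list,
# optional "improvement_focus" and "reasoning"); values are illustrative only.
expected_reasoning_output = {
    "prompts": [
        {
            "prompt": [
                {"role": "system", "content": "You are a precise assistant..."},
            ],
            "improvement_focus": "clarity",
            "reasoning": "The original system prompt was under-specified.",
        }
    ]
}
```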