opik-optimizer 0.9.2__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- opik_optimizer/__init__.py +7 -5
- opik_optimizer/_throttle.py +8 -8
- opik_optimizer/base_optimizer.py +98 -45
- opik_optimizer/cache_config.py +5 -3
- opik_optimizer/datasets/ai2_arc.py +15 -13
- opik_optimizer/datasets/cnn_dailymail.py +19 -15
- opik_optimizer/datasets/election_questions.py +10 -11
- opik_optimizer/datasets/gsm8k.py +16 -11
- opik_optimizer/datasets/halu_eval.py +6 -5
- opik_optimizer/datasets/hotpot_qa.py +17 -16
- opik_optimizer/datasets/medhallu.py +10 -7
- opik_optimizer/datasets/rag_hallucinations.py +11 -8
- opik_optimizer/datasets/ragbench.py +17 -9
- opik_optimizer/datasets/tiny_test.py +33 -37
- opik_optimizer/datasets/truthful_qa.py +18 -12
- opik_optimizer/demo/cache.py +6 -6
- opik_optimizer/demo/datasets.py +3 -7
- opik_optimizer/evolutionary_optimizer/__init__.py +3 -1
- opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +722 -429
- opik_optimizer/evolutionary_optimizer/reporting.py +155 -74
- opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +271 -188
- opik_optimizer/few_shot_bayesian_optimizer/reporting.py +79 -28
- opik_optimizer/logging_config.py +19 -15
- opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +209 -129
- opik_optimizer/meta_prompt_optimizer/reporting.py +121 -46
- opik_optimizer/mipro_optimizer/__init__.py +2 -0
- opik_optimizer/mipro_optimizer/_lm.py +38 -9
- opik_optimizer/mipro_optimizer/_mipro_optimizer_v2.py +37 -26
- opik_optimizer/mipro_optimizer/mipro_optimizer.py +132 -63
- opik_optimizer/mipro_optimizer/utils.py +5 -2
- opik_optimizer/optimizable_agent.py +179 -0
- opik_optimizer/optimization_config/chat_prompt.py +143 -73
- opik_optimizer/optimization_config/configs.py +4 -3
- opik_optimizer/optimization_config/mappers.py +18 -6
- opik_optimizer/optimization_result.py +22 -13
- opik_optimizer/py.typed +0 -0
- opik_optimizer/reporting_utils.py +89 -58
- opik_optimizer/task_evaluator.py +12 -14
- opik_optimizer/utils.py +117 -14
- {opik_optimizer-0.9.2.dist-info → opik_optimizer-1.0.1.dist-info}/METADATA +8 -8
- opik_optimizer-1.0.1.dist-info/RECORD +50 -0
- opik_optimizer-0.9.2.dist-info/RECORD +0 -48
- {opik_optimizer-0.9.2.dist-info → opik_optimizer-1.0.1.dist-info}/WHEEL +0 -0
- {opik_optimizer-0.9.2.dist-info → opik_optimizer-1.0.1.dist-info}/licenses/LICENSE +0 -0
- {opik_optimizer-0.9.2.dist-info → opik_optimizer-1.0.1.dist-info}/top_level.txt +0 -0
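The expanded diff below is `opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py`, which carries most of the user-visible API changes in this release: `project_name` moves off the optimizer constructor, metrics become plain callables over `dataset_item` and `llm_output`, and evaluation now runs through an `OptimizableAgent` class. A minimal usage sketch based only on the signatures visible in this diff; the dataset name, the `question`/`answer` fields, and the model id are illustrative assumptions:

```python
import opik

from opik_optimizer.meta_prompt_optimizer.meta_prompt_optimizer import MetaPromptOptimizer
from opik_optimizer.optimization_config.chat_prompt import ChatPrompt


def exact_match(dataset_item, llm_output):
    # Metrics are now plain callables over (dataset_item, llm_output);
    # the "answer" field is an assumption about the dataset schema.
    return 1.0 if llm_output.strip() == dataset_item["answer"].strip() else 0.0


# Assumes a dataset with items already exists (or is populated elsewhere).
dataset = opik.Opik().get_or_create_dataset("demo-dataset")

prompt = ChatPrompt(
    system="Answer the question concisely.",
    user="{question}",  # placeholders are filled from each dataset item
)

optimizer = MetaPromptOptimizer(
    model="openai/gpt-4o-mini",  # any LiteLLM model id; no project_name argument anymore
    rounds=3,
    num_prompts_per_round=4,
)

result = optimizer.optimize_prompt(
    prompt=prompt,
    dataset=dataset,
    metric=exact_match,
    n_samples=10,
)
print(result.score)
```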
--- a/opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py
+++ b/opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py
@@ -1,8 +1,7 @@
 import json
-import copy
 import logging
 import os
-from typing import Any, Callable, Dict, List, Optional,
+from typing import Any, Callable, Dict, List, Optional, Tuple, Type

 import litellm
 import opik
@@ -14,11 +13,13 @@ from opik.environment import get_tqdm_for_current_environment
 from opik.evaluation.models.litellm import opik_monitor as opik_litellm_monitor

 from opik_optimizer import task_evaluator
+from opik_optimizer import utils

 from .. import _throttle
 from ..base_optimizer import BaseOptimizer, OptimizationRound
 from ..optimization_config import chat_prompt, mappers
 from ..optimization_result import OptimizationResult
+from ..optimizable_agent import OptimizableAgent
 from . import reporting

 tqdm = get_tqdm_for_current_environment()
@@ -36,17 +37,18 @@ _rate_limiter = _throttle.get_rate_limiter_for_current_opik_installation()
 class MetaPromptOptimizer(BaseOptimizer):
     """
     The Meta-Prompt Optimizer uses meta-prompting to improve prompts based on examples and performance.
-
+
     This algorithm is best used when you have a prompt and would like to make sure it follows best
     practices.
     """
+
     # --- Constants for Default Configuration ---
     DEFAULT_ROUNDS = 3
     DEFAULT_PROMPTS_PER_ROUND = 4

     # --- Reasoning System Prompt ---
     _REASONING_SYSTEM_PROMPT = """You are an expert prompt engineer. Your task is to improve prompts for any type of task.
-
+
 Focus on making the prompt more effective by:
 1. Being clear and specific about what is expected
 2. Providing necessary context and constraints
@@ -84,11 +86,10 @@ class MetaPromptOptimizer(BaseOptimizer):
         rounds: int = DEFAULT_ROUNDS,
         num_prompts_per_round: int = DEFAULT_PROMPTS_PER_ROUND,
         num_threads: int = 12,
-        project_name: str = "Optimization",
         verbose: int = 1,
         enable_context: bool = True,
-        **model_kwargs,
-    ):
+        **model_kwargs: Any,
+    ) -> None:
         """
         Args:
             model: The model to use for evaluation
@@ -96,18 +97,16 @@ class MetaPromptOptimizer(BaseOptimizer):
             rounds: Number of optimization rounds
             num_prompts_per_round: Number of prompts to generate per round
             num_threads: Number of threads for parallel evaluation
-            project_name: Optional project name for tracking
             verbose: Controls internal logging/progress bars (0=off, 1=on).
             enable_context: Whether to include task-specific context (metrics, examples) in the reasoning prompt.
             **model_kwargs: Additional model parameters
         """
-        super().__init__(model=model,
+        super().__init__(model=model, verbose=verbose, **model_kwargs)
         self.reasoning_model = reasoning_model if reasoning_model is not None else model
         self.rounds = rounds
         self.num_prompts_per_round = num_prompts_per_round
         self.num_threads = num_threads
-        self.
-        self.dataset = None
+        self.dataset: Optional[Dataset] = None
         self._opik_client = opik_client.get_client_cached()
         self.llm_call_counter = 0
         self.enable_context = enable_context
@@ -121,6 +120,7 @@ class MetaPromptOptimizer(BaseOptimizer):
     @_throttle.rate_limited(_rate_limiter)
     def _call_model(
         self,
+        project_name: str,
         messages: List[Dict[str, str]],
         is_reasoning: bool = False,
         optimization_id: Optional[str] = None,
@@ -134,12 +134,18 @@ class MetaPromptOptimizer(BaseOptimizer):
         base_max_tokens = getattr(self, "max_tokens", 1000)

         # Use potentially different settings for reasoning calls
-        reasoning_temperature =
+        reasoning_temperature = (
+            base_temperature  # Keep same temp unless specified otherwise
+        )
         # Increase max_tokens for reasoning to ensure JSON fits, unless already high
-        reasoning_max_tokens =
+        reasoning_max_tokens = (
+            max(base_max_tokens, 3000) if is_reasoning else base_max_tokens
+        )

         llm_config_params = {
-            "temperature":
+            "temperature": (
+                reasoning_temperature if is_reasoning else base_temperature
+            ),
             "max_tokens": reasoning_max_tokens,
             "top_p": getattr(self, "top_p", 1.0),
             "frequency_penalty": getattr(self, "frequency_penalty", 0.0),
@@ -147,12 +153,12 @@ class MetaPromptOptimizer(BaseOptimizer):
         }

         # Prepare metadata that we want to be part of the LLM call context.
-        metadata_for_opik = {}
-        if
+        metadata_for_opik: Dict[str, Any] = {}
+        if project_name:
             metadata_for_opik["project_name"] = (
-
-            )
-            metadata_for_opik["opik"] = {"project_name":
+                project_name  # Top-level for general use
+            )
+            metadata_for_opik["opik"] = {"project_name": project_name}

         if optimization_id:
             # Also add to opik-specific structure if project_name was added
@@ -182,10 +188,10 @@ class MetaPromptOptimizer(BaseOptimizer):
             )

             response = litellm.completion(
-                model=model_to_use,
-                messages=messages,
+                model=model_to_use,
+                messages=messages,
                 num_retries=6,
-                **final_call_params
+                **final_call_params,
             )
             return response.choices[0].message.content
         except litellm.exceptions.RateLimitError as e:
@@ -198,27 +204,26 @@ class MetaPromptOptimizer(BaseOptimizer):
             logger.error(f"LiteLLM Context Window Exceeded Error: {e}")
             # Log prompt length if possible? Needs access to prompt_for_llm here.
             raise
-        except Exception
-            logger.error(
-
-            )
+        except Exception:
+            # logger.error(
+            #     f"Error calling model '{model_to_use}': {type(e).__name__} - {e}"
+            # )
             raise

-
-    def evaluate_prompt(
+    def _evaluate_prompt(
         self,
         prompt: chat_prompt.ChatPrompt,
         dataset: opik.Dataset,
         metric: Callable,
-        use_full_dataset: bool = True,
-        experiment_config: Optional[Dict] = None,
         n_samples: Optional[int] = None,
+        dataset_item_ids: Optional[List[str]] = None,
+        experiment_config: Optional[Dict] = None,
+        use_full_dataset: bool = True,
         optimization_id: Optional[str] = None,
-
+        **kwargs: Any,
     ) -> float:
         """
         Args:
-            prompt: The prompt to evaluate
             dataset: Opik Dataset to evaluate the prompt on
             metric: Metric functions
             use_full_dataset: Whether to use the full dataset or a subset
@@ -250,16 +255,18 @@ class MetaPromptOptimizer(BaseOptimizer):
         else:
             subset_size = None  # Use all items for final checks
             logger.debug("Using full dataset for evaluation")
-
+
         experiment_config = experiment_config or {}
         experiment_config = {
             **experiment_config,
             **{
                 "optimizer": self.__class__.__name__,
-                "
+                "agent_class": self.agent_class.__name__,
+                "agent_config": prompt.to_dict(),
+                "metric": getattr(metric, "__name__", str(metric)),
                 "dataset": dataset.name,
                 "configuration": {
-                    "prompt": prompt.
+                    "prompt": prompt.get_messages(),
                     "n_samples": subset_size,
                     "use_full_dataset": use_full_dataset,
                 },
@@ -270,25 +277,33 @@ class MetaPromptOptimizer(BaseOptimizer):

         def llm_task(dataset_item: Dict[str, Any]) -> Dict[str, str]:
             # --- Step 1: Prepare the prompt for the LLM ---
-            messages = [
-
-
-
+            # messages = [
+            #     {
+            #         "role": item["role"],
+            #         "content": item["content"].format(**dataset_item),
+            #     }
+            #     for item in prompt.get_messages()
+            # ]
+            # Step 1: create the agent
+            new_prompt = prompt.copy()
+            messages = new_prompt.get_messages(dataset_item)
+            new_prompt.set_messages(messages)
+            agent = self.agent_class(new_prompt)

             # --- Step 2: Call the model ---
             try:
-                logger.debug(
-
-                    messages=messages,
-                    is_reasoning=False,
-                    optimization_id=optimization_id,
+                logger.debug(
+                    f"Calling LLM with prompt length: {sum(len(msg['content']) for msg in messages)}"
                 )
+                raw_model_output = agent.invoke(messages)
                 logger.debug(f"LLM raw response length: {len(raw_model_output)}")
                 logger.debug(f"LLM raw output: {raw_model_output}")
             except Exception as e:
                 logger.error(f"Error calling model with prompt: {e}")
                 logger.error(f"Failed prompt: {messages}")
-                logger.error(
+                logger.error(
+                    f"Prompt length: {sum(len(msg['content']) for msg in messages)}"
+                )
                 raise

             # --- Step 3: Clean the model's output before metric evaluation ---
@@ -307,8 +322,9 @@ class MetaPromptOptimizer(BaseOptimizer):
             dataset=dataset,
             metric=metric,
             evaluated_task=llm_task,
+            dataset_item_ids=dataset_item_ids,
             num_threads=self.num_threads,
-            project_name=self.project_name,
+            project_name=self.agent_class.project_name,
             n_samples=subset_size,  # Use subset_size for trials, None for full dataset
             experiment_config=experiment_config,
             optimization_id=optimization_id,
@@ -317,7 +333,7 @@ class MetaPromptOptimizer(BaseOptimizer):
         logger.debug(f"Evaluation score: {score:.4f}")
         return score

-    def optimize_prompt(
+    def optimize_prompt(
         self,
         prompt: chat_prompt.ChatPrompt,
         dataset: Dataset,
@@ -325,13 +341,13 @@ class MetaPromptOptimizer(BaseOptimizer):
         experiment_config: Optional[Dict] = None,
         n_samples: Optional[int] = None,
         auto_continue: bool = False,
-
+        agent_class: Optional[Type[OptimizableAgent]] = None,
+        **kwargs: Any,
     ) -> OptimizationResult:
         """
         Optimize a prompt using meta-reasoning.

         Args:
-            prompt: The prompt to optimize
             dataset: The dataset to evaluate against
             metric: The metric to use for evaluation
             experiment_config: A dictionary to log with the experiments
@@ -344,12 +360,24 @@ class MetaPromptOptimizer(BaseOptimizer):
         """
         if not isinstance(prompt, chat_prompt.ChatPrompt):
             raise ValueError("Prompt must be a ChatPrompt object")
-
+
         if not isinstance(dataset, Dataset):
             raise ValueError("Dataset must be a Dataset object")
-
-        if not
-            raise ValueError(
+
+        if not callable(metric):
+            raise ValueError(
+                "Metric must be a function that takes `dataset_item` and `llm_output` as arguments."
+            )
+
+        if prompt.model is None:
+            prompt.model = self.model
+        if prompt.model_kwargs is None:
+            prompt.model_kwargs = self.model_kwargs
+
+        if agent_class is None:
+            self.agent_class = utils.create_litellm_agent_class(prompt)
+        else:
+            self.agent_class = agent_class

         total_items = len(dataset.get_items())
         if n_samples is not None and n_samples > total_items:
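With the checks above, `optimize_prompt` fills `prompt.model` and `prompt.model_kwargs` from the optimizer when they are unset and then resolves the agent class. Passing a custom class would look roughly like this, reusing the illustrative `MyAgent` and `exact_match` from the sketches above:

```python
result = optimizer.optimize_prompt(
    prompt=prompt,
    dataset=dataset,
    metric=exact_match,
    agent_class=MyAgent,  # omit to fall back to utils.create_litellm_agent_class(prompt)
)
```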
@@ -358,12 +386,11 @@ class MetaPromptOptimizer(BaseOptimizer):
             )
             n_samples = None

-
         optimization = None
         try:
             optimization = self._opik_client.create_optimization(
                 dataset_name=dataset.name,
-                objective_name=getattr(metric,
+                objective_name=getattr(metric, "__name__", str(metric)),
                 metadata={"optimizer": self.__class__.__name__},
             )
             logger.debug(f"Created optimization with ID: {optimization.id}")
@@ -377,21 +404,22 @@ class MetaPromptOptimizer(BaseOptimizer):
             algorithm=self.__class__.__name__,
             optimization_id=optimization.id if optimization is not None else None,
             dataset_id=dataset.id,
-            verbose=self.verbose
+            verbose=self.verbose,
         )
         reporting.display_configuration(
-            messages=prompt.
+            messages=prompt.get_messages(),
             optimizer_config={
                 "optimizer": self.__class__.__name__,
                 "n_samples": n_samples,
-                "auto_continue": auto_continue
+                "auto_continue": auto_continue,
             },
-            verbose=self.verbose
+            verbose=self.verbose,
         )

         try:
+            optimization_id = optimization.id if optimization is not None else None
             result = self._optimize_prompt(
-                optimization_id=
+                optimization_id=optimization_id,
                 prompt=prompt,
                 dataset=dataset,
                 metric=metric,
@@ -413,31 +441,33 @@ class MetaPromptOptimizer(BaseOptimizer):

     def _optimize_prompt(
         self,
-        optimization_id: str,
+        optimization_id: Optional[str],
         prompt: chat_prompt.ChatPrompt,
         dataset: Dataset,
         metric: Callable,
         experiment_config: Optional[Dict],
-        n_samples: int,
+        n_samples: Optional[int],
         auto_continue: bool,
-        **kwargs,
+        **kwargs: Any,
     ) -> OptimizationResult:
         self.auto_continue = auto_continue
         self.dataset = dataset
         self.prompt = prompt
-        self.llm_call_counter = 0
-        initial_prompt
+        self.llm_call_counter = 0  # Reset counter for run
+        initial_prompt = prompt

-        current_prompt = prompt
+        current_prompt = prompt
         experiment_config = experiment_config or {}
         experiment_config = {
             **experiment_config,
             **{
                 "optimizer": self.__class__.__name__,
-                "
-                "
+                "agent_class": self.agent_class.__name__,
+                "agent_config": prompt.to_dict(),
+                "metric": getattr(metric, "__name__", str(metric)),
+                "dataset": dataset.name,
                 "configuration": {
-                    "prompt":
+                    "prompt": prompt.get_messages(),
                     "rounds": self.rounds,
                     "num_prompts_per_round": self.num_prompts_per_round,
                 },
@@ -445,8 +475,8 @@ class MetaPromptOptimizer(BaseOptimizer):
         }

         with reporting.display_evaluation(verbose=self.verbose) as baseline_reporter:
-            initial_score = self.
-                prompt
+            initial_score = self._evaluate_prompt(
+                prompt,
                 optimization_id=optimization_id,
                 dataset=dataset,
                 metric=metric,
@@ -457,20 +487,22 @@ class MetaPromptOptimizer(BaseOptimizer):
             )
             best_score = initial_score
             best_prompt = current_prompt
-            rounds = []
+            rounds: List[OptimizationRound] = []

             baseline_reporter.set_score(initial_score)

         reporting.display_optimization_start_message(verbose=self.verbose)
-        with reporting.display_round_progress(
+        with reporting.display_round_progress(
+            self.rounds, verbose=self.verbose
+        ) as round_reporter:
             for round_num in range(self.rounds):
-
                 round_reporter.round_start(round_num)
                 previous_best_score = best_score
-
+
                 # Step 1. Create a set of candidate prompts
                 try:
                     candidate_prompts = self._generate_candidate_prompts(
+                        project_name=self.agent_class.project_name,
                         current_prompt=best_prompt,
                         best_score=best_score,
                         round_num=round_num,
@@ -483,14 +515,19 @@ class MetaPromptOptimizer(BaseOptimizer):
                     continue

                 # Step 2. Score each candidate prompt
-                prompt_scores = []
+                prompt_scores: List[Tuple[chat_prompt.ChatPrompt, float]] = []
                 for candidate_count, prompt in enumerate(candidate_prompts):
-                    with reporting.display_prompt_candidate_scoring_report(
+                    with reporting.display_prompt_candidate_scoring_report(
+                        verbose=self.verbose
+                    ) as eval_report:
                         eval_report.set_generated_prompts(candidate_count, prompt)

+                        new_prompt = current_prompt.copy()
+                        new_prompt.set_messages(prompt.get_messages())
+
                         try:
-                            prompt_score = self.
-                                prompt=
+                            prompt_score = self._evaluate_prompt(
+                                prompt=new_prompt,
                                 optimization_id=optimization_id,
                                 dataset=dataset,
                                 metric=metric,
@@ -501,34 +538,37 @@ class MetaPromptOptimizer(BaseOptimizer):
                             )

                             eval_report.set_final_score(best_score, prompt_score)
-                        except Exception
-
+                        except Exception:
+                            print("Failed evaluating agent; continuing...")
+                            prompt_score = 0

                         prompt_scores.append((prompt, prompt_score))
-
+
                 # Step 3. Identify potential improvements
                 if not prompt_scores:
-                    logger.warning(
+                    logger.warning(
+                        "No prompts were successfully evaluated in this round"
+                    )
                     break

                 prompt_scores.sort(key=lambda x: x[1], reverse=True)
-                best_candidate_this_round, best_cand_score_avg =
-
+                best_candidate_this_round, best_cand_score_avg = prompt_scores[0]
+                improvement = self._calculate_improvement(
+                    best_cand_score_avg, best_score
                 )
-
-
-
+                round_reporter.round_end(round_num, best_cand_score_avg, best_score)
+
                 round_data = self._create_round_data(
                     round_num=round_num,
-                    current_best_prompt=
-                    current_best_score=
-                    best_prompt_overall=
+                    current_best_prompt=best_prompt,
+                    current_best_score=best_score,
+                    best_prompt_overall=best_prompt,
                     evaluated_candidates=prompt_scores,
                     previous_best_score=previous_best_score,
                     improvement_this_round=improvement,
                 )
                 rounds.append(round_data)
-                self._add_to_history(round_data
+                self._add_to_history(round_data)

                 if improvement > 0:
                     best_score = best_cand_score_avg
@@ -537,17 +577,21 @@ class MetaPromptOptimizer(BaseOptimizer):
         reporting.display_result(
             initial_score,
             best_score,
-            best_prompt,
-            verbose=self.verbose
+            best_prompt.get_messages() if best_prompt is not None else [],
+            verbose=self.verbose,
         )

         return self._create_result(
             metric,
-            initial_prompt=initial_prompt
-
+            initial_prompt=initial_prompt.get_messages()
+            if initial_prompt is not None
+            else [],
+            best_prompt=best_prompt.get_messages() if best_prompt is not None else [],
             best_score=best_score,
             initial_score=initial_score,
             rounds=rounds,
+            dataset_id=dataset.id,
+            optimization_id=optimization_id,
         )

     def _calculate_improvement(
@@ -566,7 +610,7 @@ class MetaPromptOptimizer(BaseOptimizer):
         current_best_prompt: chat_prompt.ChatPrompt,
         current_best_score: float,
         best_prompt_overall: chat_prompt.ChatPrompt,
-        evaluated_candidates: List[
+        evaluated_candidates: List[Tuple[chat_prompt.ChatPrompt, float]],
         previous_best_score: float,
         improvement_this_round: float,
     ) -> OptimizationRound:
@@ -578,7 +622,7 @@ class MetaPromptOptimizer(BaseOptimizer):
             )
             generated_prompts_log.append(
                 {
-                    "prompt": prompt,
+                    "prompt": prompt.get_messages(),
                     "score": score,
                     "improvement": improvement_vs_prev,
                 }
@@ -602,6 +646,8 @@ class MetaPromptOptimizer(BaseOptimizer):
         best_score: float,
         initial_score: float,
         rounds: List[OptimizationRound],
+        dataset_id: Optional[str],
+        optimization_id: Optional[str],
     ) -> OptimizationResult:
         """Create the final OptimizationResult object."""
         details = {
@@ -609,7 +655,7 @@ class MetaPromptOptimizer(BaseOptimizer):
             "final_score": best_score,
             "rounds": rounds,
             "total_rounds": len(rounds),
-            "metric_name": getattr(metric,
+            "metric_name": getattr(metric, "__name__", str(metric)),
             "model": self.model,
             "temperature": self.model_kwargs.get("temperature"),
         }
@@ -620,9 +666,11 @@ class MetaPromptOptimizer(BaseOptimizer):
             score=best_score,
             initial_prompt=initial_prompt,
             initial_score=initial_score,
-            metric_name=getattr(metric,
+            metric_name=getattr(metric, "__name__", str(metric)),
             details=details,
-            llm_calls=self.llm_call_counter
+            llm_calls=self.llm_call_counter,
+            dataset_id=dataset_id,
+            optimization_id=optimization_id,
         )

     def _get_task_context(self, metric: Callable) -> str:
@@ -648,7 +696,7 @@ class MetaPromptOptimizer(BaseOptimizer):
         context += f"Dataset fields (includes both input and optionally the expected output): {', '.join([x for x in sample.keys() if x != 'id'])}\n"
         context += f"Evaluation Metric:\n{metrics_str}\n"
         context += f"\nExample:\n{json.dumps(sample)}\n"
-
+
         return context

     def _generate_candidate_prompts(
@@ -659,14 +707,14 @@ class MetaPromptOptimizer(BaseOptimizer):
         previous_rounds: List[OptimizationRound],
         metric: Callable,
         optimization_id: Optional[str] = None,
-
+        project_name: Optional[str] = None,
+    ) -> List[chat_prompt.ChatPrompt]:
         """Generate candidate prompts using meta-prompting."""
         with reporting.display_candidate_generation_report(
-            self.num_prompts_per_round,
-
-        ) as candidate_generation_report:
+            self.num_prompts_per_round, verbose=self.verbose
+        ) as candidate_generation_report:
             logger.debug(f"\nGenerating candidate prompts for round {round_num + 1}")
-            logger.debug(f"Generating from prompt: {current_prompt}")
+            logger.debug(f"Generating from prompt: {current_prompt.get_messages()}")
             logger.debug(f"Current best score: {best_score:.4f}")

             history_context = self._build_history_context(previous_rounds)
@@ -678,16 +726,22 @@ class MetaPromptOptimizer(BaseOptimizer):
             if self.enable_context:
                 task_context_str = self._get_task_context(metric=metric)
                 analysis_instruction = "Analyze the example provided (if any), the metric description (if any), and the history of scores."
-                metric_focus_instruction =
+                metric_focus_instruction = (
+                    f"Focus on improving the score for the metric: {metric.__name__}."
+                )
                 improvement_point_1 = "1. Be more specific and clear about expectations based on the metric and task."
-                logger.debug(
+                logger.debug(
+                    "Task context and metric-specific instructions enabled for reasoning prompt."
+                )
             else:
-                analysis_instruction = "Analyze the history of scores and the current prompt
+                analysis_instruction = "Analyze the history of scores and the current prompt's performance."
                 metric_focus_instruction = "Focus on generating diverse and effective prompt variations based on the history."
                 improvement_point_1 = "1. Be more specific and clear about expectations based on the task."
-                logger.debug(
+                logger.debug(
+                    "Task context and metric-specific instructions disabled for reasoning prompt."
+                )

-            user_prompt = f"""Current prompt: {current_prompt}
+            user_prompt = f"""Current prompt: {current_prompt.get_messages()}
 Current score: {best_score}
 {history_context}
 {task_context_str}
@@ -707,9 +761,10 @@ class MetaPromptOptimizer(BaseOptimizer):
             try:
                 # Use _call_model which handles selecting reasoning_model
                 content = self._call_model(
+                    project_name,
                     messages=[
                         {"role": "system", "content": self._REASONING_SYSTEM_PROMPT},
-                        {"role": "user", "content": user_prompt}
+                        {"role": "user", "content": user_prompt},
                     ],
                     is_reasoning=True,
                     optimization_id=optimization_id,
@@ -729,9 +784,13 @@ class MetaPromptOptimizer(BaseOptimizer):
                     try:
                         json_result = json.loads(json_match.group())
                     except json.JSONDecodeError as e:
-                        raise ValueError(
+                        raise ValueError(
+                            f"Could not parse JSON extracted via regex: {e} - received: {json_match.group()}"
+                        )
                 else:
-                    raise ValueError(
+                    raise ValueError(
+                        f"No JSON object found in response via regex. - received: {content}"
+                    )

                 # Validate the parsed JSON structure
                 if isinstance(json_result, list) and len(json_result) == 1:
@@ -739,27 +798,46 @@ class MetaPromptOptimizer(BaseOptimizer):

                 if not isinstance(json_result, dict) or "prompts" not in json_result:
                     logger.debug(f"Parsed JSON content: {json_result}")
-                    raise ValueError(
+                    raise ValueError(
+                        f"Parsed JSON is not a dictionary or missing 'prompts' key. - received: {json_result}"
+                    )

                 if not isinstance(json_result["prompts"], list):
                     logger.debug(f"Content of 'prompts': {json_result.get('prompts')}")
-                    raise ValueError(
+                    raise ValueError(
+                        f"'prompts' key does not contain a list. - received: {json_result.get('prompts')}"
+                    )

                 # Extract and log valid prompts
-                valid_prompts = []
+                valid_prompts: List[chat_prompt.ChatPrompt] = []
                 for item in json_result["prompts"]:
                     if (
                         isinstance(item, dict)
                         and "prompt" in item
                         and isinstance(item["prompt"], list)
                     ):
-
-
-
+                        # NOTE: might be brittle
+                        if current_prompt.user:
+                            user_text = current_prompt.user
+                        else:
+                            if current_prompt.messages is not None:
+                                user_text = current_prompt.messages[-1]["content"]
+                            else:
+                                raise Exception(
+                                    "User content not found in chat-prompt!"
+                                )
+
+                        valid_prompts.append(
+                            chat_prompt.ChatPrompt(
+                                system=item["prompt"][0]["content"],
+                                user=user_text,
+                            )
+                        )
+
                         # Log details
                         focus = item.get("improvement_focus", "N/A")
                         reasoning = item.get("reasoning", "N/A")
-                        logger.debug(f"Generated prompt: {
+                        logger.debug(f"Generated prompt: {item['prompt']}")
                         logger.debug(f" Improvement focus: {focus}")
                         logger.debug(f" Reasoning: {reasoning}")
                     else:
@@ -768,17 +846,19 @@ class MetaPromptOptimizer(BaseOptimizer):
                         )

                 if not valid_prompts:
-                    raise ValueError(
-
-
-
-                    )
-
+                    raise ValueError(
+                        "No valid prompts found in the parsed JSON response after validation."
+                    )
+
+                candidate_generation_report.set_generated_prompts()
+
                 return valid_prompts
                 # --- End Robust Parsing ---

             except Exception as e:
-                raise ValueError(
+                raise ValueError(
+                    f"Unexpected error during candidate prompt generation: {e}"
+                )

     def _build_history_context(self, previous_rounds: List[OptimizationRound]) -> str:
         """Build context from previous optimization rounds."""