opik-optimizer 0.9.1__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. opik_optimizer/__init__.py +7 -3
  2. opik_optimizer/_throttle.py +8 -8
  3. opik_optimizer/base_optimizer.py +98 -45
  4. opik_optimizer/cache_config.py +5 -3
  5. opik_optimizer/datasets/ai2_arc.py +15 -13
  6. opik_optimizer/datasets/cnn_dailymail.py +19 -15
  7. opik_optimizer/datasets/election_questions.py +10 -11
  8. opik_optimizer/datasets/gsm8k.py +16 -11
  9. opik_optimizer/datasets/halu_eval.py +6 -5
  10. opik_optimizer/datasets/hotpot_qa.py +17 -16
  11. opik_optimizer/datasets/medhallu.py +10 -7
  12. opik_optimizer/datasets/rag_hallucinations.py +11 -8
  13. opik_optimizer/datasets/ragbench.py +17 -9
  14. opik_optimizer/datasets/tiny_test.py +33 -37
  15. opik_optimizer/datasets/truthful_qa.py +18 -12
  16. opik_optimizer/demo/cache.py +6 -6
  17. opik_optimizer/demo/datasets.py +3 -7
  18. opik_optimizer/evolutionary_optimizer/__init__.py +3 -1
  19. opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +748 -437
  20. opik_optimizer/evolutionary_optimizer/reporting.py +155 -76
  21. opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +291 -181
  22. opik_optimizer/few_shot_bayesian_optimizer/reporting.py +79 -28
  23. opik_optimizer/logging_config.py +19 -15
  24. opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +234 -138
  25. opik_optimizer/meta_prompt_optimizer/reporting.py +121 -47
  26. opik_optimizer/mipro_optimizer/__init__.py +2 -0
  27. opik_optimizer/mipro_optimizer/_lm.py +41 -9
  28. opik_optimizer/mipro_optimizer/_mipro_optimizer_v2.py +37 -26
  29. opik_optimizer/mipro_optimizer/mipro_optimizer.py +135 -67
  30. opik_optimizer/mipro_optimizer/utils.py +5 -2
  31. opik_optimizer/optimizable_agent.py +179 -0
  32. opik_optimizer/optimization_config/chat_prompt.py +143 -73
  33. opik_optimizer/optimization_config/configs.py +4 -3
  34. opik_optimizer/optimization_config/mappers.py +18 -6
  35. opik_optimizer/optimization_result.py +28 -20
  36. opik_optimizer/py.typed +0 -0
  37. opik_optimizer/reporting_utils.py +96 -46
  38. opik_optimizer/task_evaluator.py +12 -14
  39. opik_optimizer/utils.py +122 -37
  40. {opik_optimizer-0.9.1.dist-info → opik_optimizer-1.0.0.dist-info}/METADATA +8 -8
  41. opik_optimizer-1.0.0.dist-info/RECORD +50 -0
  42. opik_optimizer-0.9.1.dist-info/RECORD +0 -48
  43. {opik_optimizer-0.9.1.dist-info → opik_optimizer-1.0.0.dist-info}/WHEEL +0 -0
  44. {opik_optimizer-0.9.1.dist-info → opik_optimizer-1.0.0.dist-info}/licenses/LICENSE +0 -0
  45. {opik_optimizer-0.9.1.dist-info → opik_optimizer-1.0.0.dist-info}/top_level.txt +0 -0
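The rendered hunks below cover opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py (file 24 above); a short, hedged usage sketch of the new calling convention follows the diff.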
@@ -1,7 +1,7 @@
 import json
 import logging
 import os
-from typing import Any, Callable, Dict, List, Optional, overload
+from typing import Any, Callable, Dict, List, Optional, Tuple, Type

 import litellm
 import opik
@@ -13,11 +13,13 @@ from opik.environment import get_tqdm_for_current_environment
 from opik.evaluation.models.litellm import opik_monitor as opik_litellm_monitor

 from opik_optimizer import task_evaluator
+from opik_optimizer import utils

 from .. import _throttle
 from ..base_optimizer import BaseOptimizer, OptimizationRound
 from ..optimization_config import chat_prompt, mappers
 from ..optimization_result import OptimizationResult
+from ..optimizable_agent import OptimizableAgent
 from . import reporting

 tqdm = get_tqdm_for_current_environment()
@@ -35,17 +37,18 @@ _rate_limiter = _throttle.get_rate_limiter_for_current_opik_installation()
 class MetaPromptOptimizer(BaseOptimizer):
     """
     The Meta-Prompt Optimizer uses meta-prompting to improve prompts based on examples and performance.
-
+
     This algorithm is best used when you have a prompt and would like to make sure it follows best
     practices.
     """
+
     # --- Constants for Default Configuration ---
     DEFAULT_ROUNDS = 3
     DEFAULT_PROMPTS_PER_ROUND = 4

     # --- Reasoning System Prompt ---
     _REASONING_SYSTEM_PROMPT = """You are an expert prompt engineer. Your task is to improve prompts for any type of task.
-
+
 Focus on making the prompt more effective by:
 1. Being clear and specific about what is expected
 2. Providing necessary context and constraints
@@ -83,11 +86,10 @@ class MetaPromptOptimizer(BaseOptimizer):
         rounds: int = DEFAULT_ROUNDS,
         num_prompts_per_round: int = DEFAULT_PROMPTS_PER_ROUND,
         num_threads: int = 12,
-        project_name: str = "Optimization",
         verbose: int = 1,
         enable_context: bool = True,
-        **model_kwargs,
-    ):
+        **model_kwargs: Any,
+    ) -> None:
         """
         Args:
             model: The model to use for evaluation
@@ -95,18 +97,16 @@ class MetaPromptOptimizer(BaseOptimizer):
             rounds: Number of optimization rounds
             num_prompts_per_round: Number of prompts to generate per round
             num_threads: Number of threads for parallel evaluation
-            project_name: Optional project name for tracking
             verbose: Controls internal logging/progress bars (0=off, 1=on).
             enable_context: Whether to include task-specific context (metrics, examples) in the reasoning prompt.
             **model_kwargs: Additional model parameters
         """
-        super().__init__(model=model, project_name=project_name, **model_kwargs)
+        super().__init__(model=model, verbose=verbose, **model_kwargs)
         self.reasoning_model = reasoning_model if reasoning_model is not None else model
         self.rounds = rounds
         self.num_prompts_per_round = num_prompts_per_round
         self.num_threads = num_threads
-        self.verbose = verbose
-        self.dataset = None
+        self.dataset: Optional[Dataset] = None
         self._opik_client = opik_client.get_client_cached()
         self.llm_call_counter = 0
         self.enable_context = enable_context
@@ -120,6 +120,7 @@ class MetaPromptOptimizer(BaseOptimizer):
     @_throttle.rate_limited(_rate_limiter)
     def _call_model(
         self,
+        project_name: str,
         messages: List[Dict[str, str]],
         is_reasoning: bool = False,
         optimization_id: Optional[str] = None,
@@ -133,12 +134,18 @@ class MetaPromptOptimizer(BaseOptimizer):
         base_max_tokens = getattr(self, "max_tokens", 1000)

         # Use potentially different settings for reasoning calls
-        reasoning_temperature = base_temperature  # Keep same temp unless specified otherwise
+        reasoning_temperature = (
+            base_temperature  # Keep same temp unless specified otherwise
+        )
         # Increase max_tokens for reasoning to ensure JSON fits, unless already high
-        reasoning_max_tokens = max(base_max_tokens, 3000) if is_reasoning else base_max_tokens
+        reasoning_max_tokens = (
+            max(base_max_tokens, 3000) if is_reasoning else base_max_tokens
+        )

         llm_config_params = {
-            "temperature": reasoning_temperature if is_reasoning else base_temperature,
+            "temperature": (
+                reasoning_temperature if is_reasoning else base_temperature
+            ),
             "max_tokens": reasoning_max_tokens,
             "top_p": getattr(self, "top_p", 1.0),
             "frequency_penalty": getattr(self, "frequency_penalty", 0.0),
@@ -146,12 +153,12 @@ class MetaPromptOptimizer(BaseOptimizer):
         }

         # Prepare metadata that we want to be part of the LLM call context.
-        metadata_for_opik = {}
-        if self.project_name:
+        metadata_for_opik: Dict[str, Any] = {}
+        if project_name:
             metadata_for_opik["project_name"] = (
-                self.project_name
-            )  # Top-level for general use
-            metadata_for_opik["opik"] = {"project_name": self.project_name}
+                project_name  # Top-level for general use
+            )
+            metadata_for_opik["opik"] = {"project_name": project_name}

         if optimization_id:
             # Also add to opik-specific structure if project_name was added
@@ -181,10 +188,10 @@ class MetaPromptOptimizer(BaseOptimizer):
             )

             response = litellm.completion(
-                model=model_to_use,
-                messages=messages,
+                model=model_to_use,
+                messages=messages,
                 num_retries=6,
-                **final_call_params
+                **final_call_params,
             )
             return response.choices[0].message.content
         except litellm.exceptions.RateLimitError as e:
@@ -197,27 +204,26 @@ class MetaPromptOptimizer(BaseOptimizer):
             logger.error(f"LiteLLM Context Window Exceeded Error: {e}")
             # Log prompt length if possible? Needs access to prompt_for_llm here.
             raise
-        except Exception as e:
-            logger.error(
-                f"Error calling model '{model_to_use}': {type(e).__name__} - {e}"
-            )
+        except Exception:
+            # logger.error(
+            #     f"Error calling model '{model_to_use}': {type(e).__name__} - {e}"
+            # )
             raise

-    # type: ignore
-    def evaluate_prompt(
+    def _evaluate_prompt(
         self,
         prompt: chat_prompt.ChatPrompt,
         dataset: opik.Dataset,
         metric: Callable,
-        use_full_dataset: bool = True,
-        experiment_config: Optional[Dict] = None,
         n_samples: Optional[int] = None,
+        dataset_item_ids: Optional[List[str]] = None,
+        experiment_config: Optional[Dict] = None,
+        use_full_dataset: bool = True,
         optimization_id: Optional[str] = None,
-        verbose: int = 1,
+        **kwargs: Any,
     ) -> float:
         """
         Args:
-            prompt: The prompt to evaluate
             dataset: Opik Dataset to evaluate the prompt on
             metric: Metric functions
             use_full_dataset: Whether to use the full dataset or a subset
@@ -249,16 +255,18 @@ class MetaPromptOptimizer(BaseOptimizer):
         else:
             subset_size = None  # Use all items for final checks
             logger.debug("Using full dataset for evaluation")
-
+
         experiment_config = experiment_config or {}
         experiment_config = {
             **experiment_config,
             **{
                 "optimizer": self.__class__.__name__,
-                "metric": metric.__name__,
+                "agent_class": self.agent_class.__name__,
+                "agent_config": prompt.to_dict(),
+                "metric": getattr(metric, "__name__", str(metric)),
                 "dataset": dataset.name,
                 "configuration": {
-                    "prompt": prompt.formatted_messages,
+                    "prompt": prompt.get_messages(),
                     "n_samples": subset_size,
                     "use_full_dataset": use_full_dataset,
                 },
@@ -269,25 +277,33 @@ class MetaPromptOptimizer(BaseOptimizer):

         def llm_task(dataset_item: Dict[str, Any]) -> Dict[str, str]:
             # --- Step 1: Prepare the prompt for the LLM ---
-            messages = [{
-                "role": item["role"],
-                "content": item["content"].format(**dataset_item)
-            } for item in prompt.formatted_messages]
+            # messages = [
+            #     {
+            #         "role": item["role"],
+            #         "content": item["content"].format(**dataset_item),
+            #     }
+            #     for item in prompt.get_messages()
+            # ]
+            # Step 1: create the agent
+            new_prompt = prompt.copy()
+            messages = new_prompt.get_messages(dataset_item)
+            new_prompt.set_messages(messages)
+            agent = self.agent_class(new_prompt)

             # --- Step 2: Call the model ---
             try:
-                logger.debug(f"Calling LLM with prompt length: {sum(len(msg['content']) for msg in messages)}")
-                raw_model_output = self._call_model(
-                    messages=messages,
-                    is_reasoning=False,
-                    optimization_id=optimization_id,
+                logger.debug(
+                    f"Calling LLM with prompt length: {sum(len(msg['content']) for msg in messages)}"
                 )
+                raw_model_output = agent.invoke(messages)
                 logger.debug(f"LLM raw response length: {len(raw_model_output)}")
                 logger.debug(f"LLM raw output: {raw_model_output}")
             except Exception as e:
                 logger.error(f"Error calling model with prompt: {e}")
                 logger.error(f"Failed prompt: {messages}")
-                logger.error(f"Prompt length: {sum(len(msg['content']) for msg in messages)}")
+                logger.error(
+                    f"Prompt length: {sum(len(msg['content']) for msg in messages)}"
+                )
                 raise

             # --- Step 3: Clean the model's output before metric evaluation ---
@@ -300,14 +316,15 @@ class MetaPromptOptimizer(BaseOptimizer):

         # Use dataset's get_items with limit for sampling
         logger.debug(
-            f"Starting evaluation with {subset_size if subset_size else 'all'} samples for metric: {metric.__name__}"
+            f"Starting evaluation with {subset_size if subset_size else 'all'} samples for metric: {getattr(metric, '__name__', str(metric))}"
         )
         score = task_evaluator.evaluate(
             dataset=dataset,
             metric=metric,
             evaluated_task=llm_task,
+            dataset_item_ids=dataset_item_ids,
             num_threads=self.num_threads,
-            project_name=self.project_name,
+            project_name=self.agent_class.project_name,
             n_samples=subset_size,  # Use subset_size for trials, None for full dataset
             experiment_config=experiment_config,
             optimization_id=optimization_id,
@@ -316,7 +333,7 @@ class MetaPromptOptimizer(BaseOptimizer):
         logger.debug(f"Evaluation score: {score:.4f}")
         return score

-    def optimize_prompt(  # type: ignore[override]
+    def optimize_prompt(
         self,
         prompt: chat_prompt.ChatPrompt,
         dataset: Dataset,
@@ -324,13 +341,13 @@ class MetaPromptOptimizer(BaseOptimizer):
         experiment_config: Optional[Dict] = None,
         n_samples: Optional[int] = None,
         auto_continue: bool = False,
-        **kwargs,
+        agent_class: Optional[Type[OptimizableAgent]] = None,
+        **kwargs: Any,
     ) -> OptimizationResult:
         """
         Optimize a prompt using meta-reasoning.

         Args:
-            prompt: The prompt to optimize
             dataset: The dataset to evaluate against
             metric: The metric to use for evaluation
             experiment_config: A dictionary to log with the experiments
@@ -341,8 +358,27 @@ class MetaPromptOptimizer(BaseOptimizer):
         Returns:
             OptimizationResult: Structured result containing optimization details
         """
-        reporting.display_header(self.__class__.__name__, verbose=self.verbose)
-
+        if not isinstance(prompt, chat_prompt.ChatPrompt):
+            raise ValueError("Prompt must be a ChatPrompt object")
+
+        if not isinstance(dataset, Dataset):
+            raise ValueError("Dataset must be a Dataset object")
+
+        if not callable(metric):
+            raise ValueError(
+                "Metric must be a function that takes `dataset_item` and `llm_output` as arguments."
+            )
+
+        if prompt.model is None:
+            prompt.model = self.model
+        if prompt.model_kwargs is None:
+            prompt.model_kwargs = self.model_kwargs
+
+        if agent_class is None:
+            self.agent_class = utils.create_litellm_agent_class(prompt)
+        else:
+            self.agent_class = agent_class
+
         total_items = len(dataset.get_items())
         if n_samples is not None and n_samples > total_items:
             logger.warning(
@@ -350,21 +386,11 @@ class MetaPromptOptimizer(BaseOptimizer):
             )
             n_samples = None

-        reporting.display_configuration(
-            messages=prompt.formatted_messages,
-            optimizer_config={
-                "optimizer": self.__class__.__name__,
-                "n_samples": n_samples,
-                "auto_continue": auto_continue
-            },
-            verbose=self.verbose
-        )
-
         optimization = None
         try:
             optimization = self._opik_client.create_optimization(
                 dataset_name=dataset.name,
-                objective_name=metric.__name__,
+                objective_name=getattr(metric, "__name__", str(metric)),
                 metadata={"optimizer": self.__class__.__name__},
             )
             logger.debug(f"Created optimization with ID: {optimization.id}")
@@ -374,9 +400,26 @@ class MetaPromptOptimizer(BaseOptimizer):
             )
             optimization = None

+        reporting.display_header(
+            algorithm=self.__class__.__name__,
+            optimization_id=optimization.id if optimization is not None else None,
+            dataset_id=dataset.id,
+            verbose=self.verbose,
+        )
+        reporting.display_configuration(
+            messages=prompt.get_messages(),
+            optimizer_config={
+                "optimizer": self.__class__.__name__,
+                "n_samples": n_samples,
+                "auto_continue": auto_continue,
+            },
+            verbose=self.verbose,
+        )
+
         try:
+            optimization_id = optimization.id if optimization is not None else None
             result = self._optimize_prompt(
-                optimization_id=optimization.id if optimization is not None else None,
+                optimization_id=optimization_id,
                 prompt=prompt,
                 dataset=dataset,
                 metric=metric,
@@ -398,30 +441,33 @@ class MetaPromptOptimizer(BaseOptimizer):

     def _optimize_prompt(
         self,
-        optimization_id: str,
+        optimization_id: Optional[str],
         prompt: chat_prompt.ChatPrompt,
         dataset: Dataset,
         metric: Callable,
         experiment_config: Optional[Dict],
-        n_samples: int,
+        n_samples: Optional[int],
         auto_continue: bool,
-        **kwargs,
+        **kwargs: Any,
     ) -> OptimizationResult:
         self.auto_continue = auto_continue
         self.dataset = dataset
         self.prompt = prompt
-        self.llm_call_counter = 0 # Reset counter for run
+        self.llm_call_counter = 0  # Reset counter for run
+        initial_prompt = prompt

-        current_prompt = prompt.formatted_messages
+        current_prompt = prompt
         experiment_config = experiment_config or {}
         experiment_config = {
             **experiment_config,
             **{
                 "optimizer": self.__class__.__name__,
-                "metric": metric.__name__,
-                "dataset": self.dataset.name,
+                "agent_class": self.agent_class.__name__,
+                "agent_config": prompt.to_dict(),
+                "metric": getattr(metric, "__name__", str(metric)),
+                "dataset": dataset.name,
                 "configuration": {
-                    "prompt": current_prompt,
+                    "prompt": prompt.get_messages(),
                     "rounds": self.rounds,
                     "num_prompts_per_round": self.num_prompts_per_round,
                 },
@@ -429,8 +475,8 @@ class MetaPromptOptimizer(BaseOptimizer):
         }

         with reporting.display_evaluation(verbose=self.verbose) as baseline_reporter:
-            initial_score = self.evaluate_prompt(
-                prompt=prompt,
+            initial_score = self._evaluate_prompt(
+                prompt,
                 optimization_id=optimization_id,
                 dataset=dataset,
                 metric=metric,
@@ -441,20 +487,22 @@ class MetaPromptOptimizer(BaseOptimizer):
             )
             best_score = initial_score
             best_prompt = current_prompt
-            rounds = []
+            rounds: List[OptimizationRound] = []

             baseline_reporter.set_score(initial_score)

         reporting.display_optimization_start_message(verbose=self.verbose)
-        with reporting.display_round_progress(self.rounds, verbose=self.verbose) as round_reporter:
+        with reporting.display_round_progress(
+            self.rounds, verbose=self.verbose
+        ) as round_reporter:
             for round_num in range(self.rounds):
-
                 round_reporter.round_start(round_num)
                 previous_best_score = best_score
-
+
                 # Step 1. Create a set of candidate prompts
                 try:
                     candidate_prompts = self._generate_candidate_prompts(
+                        project_name=self.agent_class.project_name,
                         current_prompt=best_prompt,
                         best_score=best_score,
                         round_num=round_num,
@@ -467,14 +515,19 @@ class MetaPromptOptimizer(BaseOptimizer):
                     continue

                 # Step 2. Score each candidate prompt
-                prompt_scores = []
+                prompt_scores: List[Tuple[chat_prompt.ChatPrompt, float]] = []
                 for candidate_count, prompt in enumerate(candidate_prompts):
-                    with reporting.display_prompt_candidate_scoring_report(candidate_count, prompt, verbose=self.verbose) as eval_report:
+                    with reporting.display_prompt_candidate_scoring_report(
+                        verbose=self.verbose
+                    ) as eval_report:
                         eval_report.set_generated_prompts(candidate_count, prompt)

+                        new_prompt = current_prompt.copy()
+                        new_prompt.set_messages(prompt.get_messages())
+
                         try:
-                            prompt_score = self.evaluate_prompt(
-                                prompt=chat_prompt.ChatPrompt(messages=prompt),
+                            prompt_score = self._evaluate_prompt(
+                                prompt=new_prompt,
                                 optimization_id=optimization_id,
                                 dataset=dataset,
                                 metric=metric,
@@ -485,34 +538,37 @@ class MetaPromptOptimizer(BaseOptimizer):
                             )

                             eval_report.set_final_score(best_score, prompt_score)
-                        except Exception as e:
-                            raise ValueError(f"Error evaluating candidate prompt: {e}")
+                        except Exception:
+                            print("Failed evaluating agent; continuing...")
+                            prompt_score = 0

                     prompt_scores.append((prompt, prompt_score))
-
+
                 # Step 3. Identify potential improvements
                 if not prompt_scores:
-                    logger.warning("No prompts were successfully evaluated in this round")
+                    logger.warning(
+                        "No prompts were successfully evaluated in this round"
+                    )
                     break

                 prompt_scores.sort(key=lambda x: x[1], reverse=True)
-                best_candidate_this_round, best_cand_score_avg = (
-                    prompt_scores[0]
+                best_candidate_this_round, best_cand_score_avg = prompt_scores[0]
+                improvement = self._calculate_improvement(
+                    best_cand_score_avg, best_score
                 )
-                improvement = self._calculate_improvement(best_cand_score_avg, best_score)
-                round_reporter.round_end(round_num, best_cand_score_avg, best_score, best_prompt)
-
+                round_reporter.round_end(round_num, best_cand_score_avg, best_score)
+
                 round_data = self._create_round_data(
                     round_num=round_num,
-                    current_best_prompt=chat_prompt.ChatPrompt(messages=best_candidate_this_round),
-                    current_best_score=best_cand_score_avg,
-                    best_prompt_overall=chat_prompt.ChatPrompt(messages=best_prompt),
+                    current_best_prompt=best_prompt,
+                    current_best_score=best_score,
+                    best_prompt_overall=best_prompt,
                     evaluated_candidates=prompt_scores,
                     previous_best_score=previous_best_score,
                     improvement_this_round=improvement,
                 )
                 rounds.append(round_data)
-                self._add_to_history(round_data.model_dump())
+                self._add_to_history(round_data)

                 if improvement > 0:
                     best_score = best_cand_score_avg
@@ -521,17 +577,21 @@ class MetaPromptOptimizer(BaseOptimizer):
         reporting.display_result(
             initial_score,
             best_score,
-            best_prompt,
-            verbose=self.verbose
+            best_prompt.get_messages() if best_prompt is not None else [],
+            verbose=self.verbose,
         )

         return self._create_result(
             metric,
-            prompt,
-            best_prompt,
-            best_score,
-            initial_score,
-            rounds,
+            initial_prompt=initial_prompt.get_messages()
+            if initial_prompt is not None
+            else [],
+            best_prompt=best_prompt.get_messages() if best_prompt is not None else [],
+            best_score=best_score,
+            initial_score=initial_score,
+            rounds=rounds,
+            dataset_id=dataset.id,
+            optimization_id=optimization_id,
         )

     def _calculate_improvement(
@@ -550,7 +610,7 @@ class MetaPromptOptimizer(BaseOptimizer):
         current_best_prompt: chat_prompt.ChatPrompt,
         current_best_score: float,
         best_prompt_overall: chat_prompt.ChatPrompt,
-        evaluated_candidates: List[tuple[str, float, List[float]]],
+        evaluated_candidates: List[Tuple[chat_prompt.ChatPrompt, float]],
         previous_best_score: float,
         improvement_this_round: float,
     ) -> OptimizationRound:
@@ -562,7 +622,7 @@ class MetaPromptOptimizer(BaseOptimizer):
             )
             generated_prompts_log.append(
                 {
-                    "prompt": prompt,
+                    "prompt": prompt.get_messages(),
                     "score": score,
                     "improvement": improvement_vs_prev,
                 }
@@ -581,21 +641,21 @@ class MetaPromptOptimizer(BaseOptimizer):
     def _create_result(
         self,
         metric: Callable,
-        prompt: chat_prompt.ChatPrompt,
-        best_prompt: str,
+        initial_prompt: List[Dict[str, str]],
+        best_prompt: List[Dict[str, str]],
         best_score: float,
         initial_score: float,
         rounds: List[OptimizationRound],
+        dataset_id: Optional[str],
+        optimization_id: Optional[str],
     ) -> OptimizationResult:
         """Create the final OptimizationResult object."""
         details = {
-            "initial_prompt": prompt,
-            "initial_score": initial_score,
             "final_prompt": best_prompt,
             "final_score": best_score,
             "rounds": rounds,
             "total_rounds": len(rounds),
-            "metric_name": metric.__name__,
+            "metric_name": getattr(metric, "__name__", str(metric)),
             "model": self.model,
             "temperature": self.model_kwargs.get("temperature"),
         }
@@ -604,9 +664,13 @@ class MetaPromptOptimizer(BaseOptimizer):
             optimizer=self.__class__.__name__,
             prompt=best_prompt,
             score=best_score,
-            metric_name=metric.__name__,
+            initial_prompt=initial_prompt,
+            initial_score=initial_score,
+            metric_name=getattr(metric, "__name__", str(metric)),
             details=details,
-            llm_calls=self.llm_call_counter
+            llm_calls=self.llm_call_counter,
+            dataset_id=dataset_id,
+            optimization_id=optimization_id,
         )

     def _get_task_context(self, metric: Callable) -> str:
@@ -632,7 +696,7 @@ class MetaPromptOptimizer(BaseOptimizer):
         context += f"Dataset fields (includes both input and optionally the expected output): {', '.join([x for x in sample.keys() if x != 'id'])}\n"
         context += f"Evaluation Metric:\n{metrics_str}\n"
         context += f"\nExample:\n{json.dumps(sample)}\n"
-
+
         return context

     def _generate_candidate_prompts(
@@ -643,14 +707,14 @@ class MetaPromptOptimizer(BaseOptimizer):
         previous_rounds: List[OptimizationRound],
         metric: Callable,
         optimization_id: Optional[str] = None,
-    ) -> List[str]:
+        project_name: Optional[str] = None,
+    ) -> List[chat_prompt.ChatPrompt]:
         """Generate candidate prompts using meta-prompting."""
         with reporting.display_candidate_generation_report(
-            self.num_prompts_per_round,
-            verbose=self.verbose
-        ) as candidate_generation_report:
+            self.num_prompts_per_round, verbose=self.verbose
+        ) as candidate_generation_report:
             logger.debug(f"\nGenerating candidate prompts for round {round_num + 1}")
-            logger.debug(f"Generating from prompt: {current_prompt}")
+            logger.debug(f"Generating from prompt: {current_prompt.get_messages()}")
             logger.debug(f"Current best score: {best_score:.4f}")

             history_context = self._build_history_context(previous_rounds)
@@ -662,16 +726,22 @@ class MetaPromptOptimizer(BaseOptimizer):
             if self.enable_context:
                 task_context_str = self._get_task_context(metric=metric)
                 analysis_instruction = "Analyze the example provided (if any), the metric description (if any), and the history of scores."
-                metric_focus_instruction = f"Focus on improving the score for the metric: {metric.__name__}."
+                metric_focus_instruction = (
+                    f"Focus on improving the score for the metric: {metric.__name__}."
+                )
                 improvement_point_1 = "1. Be more specific and clear about expectations based on the metric and task."
-                logger.debug("Task context and metric-specific instructions enabled for reasoning prompt.")
+                logger.debug(
+                    "Task context and metric-specific instructions enabled for reasoning prompt."
+                )
             else:
-                analysis_instruction = "Analyze the history of scores and the current prompt\'s performance."
+                analysis_instruction = "Analyze the history of scores and the current prompt's performance."
                 metric_focus_instruction = "Focus on generating diverse and effective prompt variations based on the history."
                 improvement_point_1 = "1. Be more specific and clear about expectations based on the task."
-                logger.debug("Task context and metric-specific instructions disabled for reasoning prompt.")
+                logger.debug(
+                    "Task context and metric-specific instructions disabled for reasoning prompt."
+                )

-            user_prompt = f"""Current prompt: {current_prompt}
+            user_prompt = f"""Current prompt: {current_prompt.get_messages()}
 Current score: {best_score}
 {history_context}
 {task_context_str}
@@ -691,9 +761,10 @@ class MetaPromptOptimizer(BaseOptimizer):
             try:
                 # Use _call_model which handles selecting reasoning_model
                 content = self._call_model(
+                    project_name,
                     messages=[
                         {"role": "system", "content": self._REASONING_SYSTEM_PROMPT},
-                        {"role": "user", "content": user_prompt}
+                        {"role": "user", "content": user_prompt},
                     ],
                     is_reasoning=True,
                     optimization_id=optimization_id,
@@ -713,9 +784,13 @@ class MetaPromptOptimizer(BaseOptimizer):
                     try:
                         json_result = json.loads(json_match.group())
                     except json.JSONDecodeError as e:
-                        raise ValueError(f"Could not parse JSON extracted via regex: {e} - received: {json_match.group()}")
+                        raise ValueError(
+                            f"Could not parse JSON extracted via regex: {e} - received: {json_match.group()}"
+                        )
                 else:
-                    raise ValueError(f"No JSON object found in response via regex. - received: {content}")
+                    raise ValueError(
+                        f"No JSON object found in response via regex. - received: {content}"
+                    )

                 # Validate the parsed JSON structure
                 if isinstance(json_result, list) and len(json_result) == 1:
@@ -723,27 +798,46 @@ class MetaPromptOptimizer(BaseOptimizer):

                 if not isinstance(json_result, dict) or "prompts" not in json_result:
                     logger.debug(f"Parsed JSON content: {json_result}")
-                    raise ValueError(f"Parsed JSON is not a dictionary or missing 'prompts' key. - received: {json_result}")
+                    raise ValueError(
+                        f"Parsed JSON is not a dictionary or missing 'prompts' key. - received: {json_result}"
+                    )

                 if not isinstance(json_result["prompts"], list):
                     logger.debug(f"Content of 'prompts': {json_result.get('prompts')}")
-                    raise ValueError(f"'prompts' key does not contain a list. - received: {json_result.get('prompts')}")
+                    raise ValueError(
+                        f"'prompts' key does not contain a list. - received: {json_result.get('prompts')}"
+                    )

                 # Extract and log valid prompts
-                valid_prompts = []
+                valid_prompts: List[chat_prompt.ChatPrompt] = []
                 for item in json_result["prompts"]:
                     if (
                         isinstance(item, dict)
                         and "prompt" in item
                         and isinstance(item["prompt"], list)
                     ):
-                        prompt_text = item["prompt"]
-                        valid_prompts.append(prompt_text)
-
+                        # NOTE: might be brittle
+                        if current_prompt.user:
+                            user_text = current_prompt.user
+                        else:
+                            if current_prompt.messages is not None:
+                                user_text = current_prompt.messages[-1]["content"]
+                            else:
+                                raise Exception(
+                                    "User content not found in chat-prompt!"
+                                )
+
+                        valid_prompts.append(
+                            chat_prompt.ChatPrompt(
+                                system=item["prompt"][0]["content"],
+                                user=user_text,
+                            )
+                        )
+

                         # Log details
                         focus = item.get("improvement_focus", "N/A")
                         reasoning = item.get("reasoning", "N/A")
-                        logger.debug(f"Generated prompt: {prompt_text}")
+                        logger.debug(f"Generated prompt: {item['prompt']}")
                         logger.debug(f" Improvement focus: {focus}")
                         logger.debug(f" Reasoning: {reasoning}")
                     else:
@@ -752,17 +846,19 @@ class MetaPromptOptimizer(BaseOptimizer):
                         )

                 if not valid_prompts:
-                    raise ValueError("No valid prompts found in the parsed JSON response after validation.")
-
-                candidate_generation_report.set_generated_prompts(
-                    self.num_prompts_per_round
-                )
-
+                    raise ValueError(
+                        "No valid prompts found in the parsed JSON response after validation."
+                    )
+
+                candidate_generation_report.set_generated_prompts()
+
                 return valid_prompts
                 # --- End Robust Parsing ---

             except Exception as e:
-                raise ValueError(f"Unexpected error during candidate prompt generation: {e}")
+                raise ValueError(
+                    f"Unexpected error during candidate prompt generation: {e}"
+                )

     def _build_history_context(self, previous_rounds: List[OptimizationRound]) -> str:
         """Build context from previous optimization rounds."""