opik-optimizer 0.9.2__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. opik_optimizer/__init__.py +7 -5
  2. opik_optimizer/_throttle.py +8 -8
  3. opik_optimizer/base_optimizer.py +98 -45
  4. opik_optimizer/cache_config.py +5 -3
  5. opik_optimizer/datasets/ai2_arc.py +15 -13
  6. opik_optimizer/datasets/cnn_dailymail.py +19 -15
  7. opik_optimizer/datasets/election_questions.py +10 -11
  8. opik_optimizer/datasets/gsm8k.py +16 -11
  9. opik_optimizer/datasets/halu_eval.py +6 -5
  10. opik_optimizer/datasets/hotpot_qa.py +17 -16
  11. opik_optimizer/datasets/medhallu.py +10 -7
  12. opik_optimizer/datasets/rag_hallucinations.py +11 -8
  13. opik_optimizer/datasets/ragbench.py +17 -9
  14. opik_optimizer/datasets/tiny_test.py +33 -37
  15. opik_optimizer/datasets/truthful_qa.py +18 -12
  16. opik_optimizer/demo/cache.py +6 -6
  17. opik_optimizer/demo/datasets.py +3 -7
  18. opik_optimizer/evolutionary_optimizer/__init__.py +3 -1
  19. opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +722 -429
  20. opik_optimizer/evolutionary_optimizer/reporting.py +155 -74
  21. opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +271 -188
  22. opik_optimizer/few_shot_bayesian_optimizer/reporting.py +79 -28
  23. opik_optimizer/logging_config.py +19 -15
  24. opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +209 -129
  25. opik_optimizer/meta_prompt_optimizer/reporting.py +121 -46
  26. opik_optimizer/mipro_optimizer/__init__.py +2 -0
  27. opik_optimizer/mipro_optimizer/_lm.py +38 -9
  28. opik_optimizer/mipro_optimizer/_mipro_optimizer_v2.py +37 -26
  29. opik_optimizer/mipro_optimizer/mipro_optimizer.py +132 -63
  30. opik_optimizer/mipro_optimizer/utils.py +5 -2
  31. opik_optimizer/optimizable_agent.py +179 -0
  32. opik_optimizer/optimization_config/chat_prompt.py +143 -73
  33. opik_optimizer/optimization_config/configs.py +4 -3
  34. opik_optimizer/optimization_config/mappers.py +18 -6
  35. opik_optimizer/optimization_result.py +22 -13
  36. opik_optimizer/py.typed +0 -0
  37. opik_optimizer/reporting_utils.py +89 -58
  38. opik_optimizer/task_evaluator.py +12 -14
  39. opik_optimizer/utils.py +117 -14
  40. {opik_optimizer-0.9.2.dist-info → opik_optimizer-1.0.1.dist-info}/METADATA +8 -8
  41. opik_optimizer-1.0.1.dist-info/RECORD +50 -0
  42. opik_optimizer-0.9.2.dist-info/RECORD +0 -48
  43. {opik_optimizer-0.9.2.dist-info → opik_optimizer-1.0.1.dist-info}/WHEEL +0 -0
  44. {opik_optimizer-0.9.2.dist-info → opik_optimizer-1.0.1.dist-info}/licenses/LICENSE +0 -0
  45. {opik_optimizer-0.9.2.dist-info → opik_optimizer-1.0.1.dist-info}/top_level.txt +0 -0
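The hunks that follow are from opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py (item 24 above). For orientation, here is a minimal usage sketch of the 1.0.1 API as it appears in this diff: the constructor no longer accepts project_name, the metric must be a plain callable taking dataset_item and llm_output, and optimize_prompt gains an optional agent_class hook. This is a sketch inferred from the diff, not package documentation; the model string, dataset name, metric body, and the "{question}" field are placeholders.

```python
import opik

from opik_optimizer.meta_prompt_optimizer.meta_prompt_optimizer import MetaPromptOptimizer
from opik_optimizer.optimization_config.chat_prompt import ChatPrompt


def exact_match(dataset_item, llm_output):
    # Metric contract enforced in this diff: a callable taking
    # `dataset_item` and `llm_output`; the return value is used as the score.
    return float(dataset_item["answer"].strip() == llm_output.strip())  # placeholder field


# project_name is no longer a constructor argument in 1.0.1 (removed in this diff).
optimizer = MetaPromptOptimizer(
    model="openai/gpt-4o-mini",  # placeholder LiteLLM model name
    rounds=3,
    num_prompts_per_round=4,
)

# ChatPrompt(system=..., user=...) mirrors how this diff builds candidate prompts;
# "{question}" assumes the dataset items contain a "question" field.
prompt = ChatPrompt(
    system="Answer the question concisely.",
    user="{question}",
)

dataset = opik.Opik().get_dataset("my-dataset")  # placeholder dataset name

# agent_class is the new optional hook; when omitted the optimizer builds a
# LiteLLM-backed agent class via utils.create_litellm_agent_class(prompt).
result = optimizer.optimize_prompt(
    prompt=prompt,
    dataset=dataset,
    metric=exact_match,
    n_samples=10,
)
print(result.score)
```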
@@ -1,8 +1,7 @@
  import json
- import copy
  import logging
  import os
- from typing import Any, Callable, Dict, List, Optional, overload
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Type
 
  import litellm
  import opik
@@ -14,11 +13,13 @@ from opik.environment import get_tqdm_for_current_environment
  from opik.evaluation.models.litellm import opik_monitor as opik_litellm_monitor
 
  from opik_optimizer import task_evaluator
+ from opik_optimizer import utils
 
  from .. import _throttle
  from ..base_optimizer import BaseOptimizer, OptimizationRound
  from ..optimization_config import chat_prompt, mappers
  from ..optimization_result import OptimizationResult
+ from ..optimizable_agent import OptimizableAgent
  from . import reporting
 
  tqdm = get_tqdm_for_current_environment()
@@ -36,17 +37,18 @@ _rate_limiter = _throttle.get_rate_limiter_for_current_opik_installation()
  class MetaPromptOptimizer(BaseOptimizer):
  """
  The Meta-Prompt Optimizer uses meta-prompting to improve prompts based on examples and performance.
-
+
  This algorithm is best used when you have a prompt and would like to make sure it follows best
  practices.
  """
+
  # --- Constants for Default Configuration ---
  DEFAULT_ROUNDS = 3
  DEFAULT_PROMPTS_PER_ROUND = 4
 
  # --- Reasoning System Prompt ---
  _REASONING_SYSTEM_PROMPT = """You are an expert prompt engineer. Your task is to improve prompts for any type of task.
-
+
  Focus on making the prompt more effective by:
  1. Being clear and specific about what is expected
  2. Providing necessary context and constraints
@@ -84,11 +86,10 @@ class MetaPromptOptimizer(BaseOptimizer):
  rounds: int = DEFAULT_ROUNDS,
  num_prompts_per_round: int = DEFAULT_PROMPTS_PER_ROUND,
  num_threads: int = 12,
- project_name: str = "Optimization",
  verbose: int = 1,
  enable_context: bool = True,
- **model_kwargs,
- ):
+ **model_kwargs: Any,
+ ) -> None:
  """
  Args:
  model: The model to use for evaluation
@@ -96,18 +97,16 @@ class MetaPromptOptimizer(BaseOptimizer):
  rounds: Number of optimization rounds
  num_prompts_per_round: Number of prompts to generate per round
  num_threads: Number of threads for parallel evaluation
- project_name: Optional project name for tracking
  verbose: Controls internal logging/progress bars (0=off, 1=on).
  enable_context: Whether to include task-specific context (metrics, examples) in the reasoning prompt.
  **model_kwargs: Additional model parameters
  """
- super().__init__(model=model, project_name=project_name, **model_kwargs)
+ super().__init__(model=model, verbose=verbose, **model_kwargs)
  self.reasoning_model = reasoning_model if reasoning_model is not None else model
  self.rounds = rounds
  self.num_prompts_per_round = num_prompts_per_round
  self.num_threads = num_threads
- self.verbose = verbose
- self.dataset = None
+ self.dataset: Optional[Dataset] = None
  self._opik_client = opik_client.get_client_cached()
  self.llm_call_counter = 0
  self.enable_context = enable_context
@@ -121,6 +120,7 @@ class MetaPromptOptimizer(BaseOptimizer):
  @_throttle.rate_limited(_rate_limiter)
  def _call_model(
  self,
+ project_name: str,
  messages: List[Dict[str, str]],
  is_reasoning: bool = False,
  optimization_id: Optional[str] = None,
@@ -134,12 +134,18 @@ class MetaPromptOptimizer(BaseOptimizer):
  base_max_tokens = getattr(self, "max_tokens", 1000)
 
  # Use potentially different settings for reasoning calls
- reasoning_temperature = base_temperature # Keep same temp unless specified otherwise
+ reasoning_temperature = (
+ base_temperature # Keep same temp unless specified otherwise
+ )
  # Increase max_tokens for reasoning to ensure JSON fits, unless already high
- reasoning_max_tokens = max(base_max_tokens, 3000) if is_reasoning else base_max_tokens
+ reasoning_max_tokens = (
+ max(base_max_tokens, 3000) if is_reasoning else base_max_tokens
+ )
 
  llm_config_params = {
- "temperature": reasoning_temperature if is_reasoning else base_temperature,
+ "temperature": (
+ reasoning_temperature if is_reasoning else base_temperature
+ ),
  "max_tokens": reasoning_max_tokens,
  "top_p": getattr(self, "top_p", 1.0),
  "frequency_penalty": getattr(self, "frequency_penalty", 0.0),
@@ -147,12 +153,12 @@ class MetaPromptOptimizer(BaseOptimizer):
  }
 
  # Prepare metadata that we want to be part of the LLM call context.
- metadata_for_opik = {}
- if self.project_name:
+ metadata_for_opik: Dict[str, Any] = {}
+ if project_name:
  metadata_for_opik["project_name"] = (
- self.project_name
- ) # Top-level for general use
- metadata_for_opik["opik"] = {"project_name": self.project_name}
+ project_name # Top-level for general use
+ )
+ metadata_for_opik["opik"] = {"project_name": project_name}
 
  if optimization_id:
  # Also add to opik-specific structure if project_name was added
@@ -182,10 +188,10 @@ class MetaPromptOptimizer(BaseOptimizer):
  )
 
  response = litellm.completion(
- model=model_to_use,
- messages=messages,
+ model=model_to_use,
+ messages=messages,
  num_retries=6,
- **final_call_params
+ **final_call_params,
  )
  return response.choices[0].message.content
  except litellm.exceptions.RateLimitError as e:
@@ -198,27 +204,26 @@ class MetaPromptOptimizer(BaseOptimizer):
  logger.error(f"LiteLLM Context Window Exceeded Error: {e}")
  # Log prompt length if possible? Needs access to prompt_for_llm here.
  raise
- except Exception as e:
- logger.error(
- f"Error calling model '{model_to_use}': {type(e).__name__} - {e}"
- )
+ except Exception:
+ # logger.error(
+ # f"Error calling model '{model_to_use}': {type(e).__name__} - {e}"
+ # )
  raise
 
- # type: ignore
- def evaluate_prompt(
+ def _evaluate_prompt(
  self,
  prompt: chat_prompt.ChatPrompt,
  dataset: opik.Dataset,
  metric: Callable,
- use_full_dataset: bool = True,
- experiment_config: Optional[Dict] = None,
  n_samples: Optional[int] = None,
+ dataset_item_ids: Optional[List[str]] = None,
+ experiment_config: Optional[Dict] = None,
+ use_full_dataset: bool = True,
  optimization_id: Optional[str] = None,
- verbose: int = 1,
+ **kwargs: Any,
  ) -> float:
  """
  Args:
- prompt: The prompt to evaluate
  dataset: Opik Dataset to evaluate the prompt on
  metric: Metric functions
  use_full_dataset: Whether to use the full dataset or a subset
@@ -250,16 +255,18 @@ class MetaPromptOptimizer(BaseOptimizer):
  else:
  subset_size = None # Use all items for final checks
  logger.debug("Using full dataset for evaluation")
-
+
  experiment_config = experiment_config or {}
  experiment_config = {
  **experiment_config,
  **{
  "optimizer": self.__class__.__name__,
- "metric": getattr(metric, '__name__', str(metric)),
+ "agent_class": self.agent_class.__name__,
+ "agent_config": prompt.to_dict(),
+ "metric": getattr(metric, "__name__", str(metric)),
  "dataset": dataset.name,
  "configuration": {
- "prompt": prompt.formatted_messages,
+ "prompt": prompt.get_messages(),
  "n_samples": subset_size,
  "use_full_dataset": use_full_dataset,
  },
@@ -270,25 +277,33 @@ class MetaPromptOptimizer(BaseOptimizer):
 
  def llm_task(dataset_item: Dict[str, Any]) -> Dict[str, str]:
  # --- Step 1: Prepare the prompt for the LLM ---
- messages = [{
- "role": item["role"],
- "content": item["content"].format(**dataset_item)
- } for item in prompt.formatted_messages]
+ # messages = [
+ # {
+ # "role": item["role"],
+ # "content": item["content"].format(**dataset_item),
+ # }
+ # for item in prompt.get_messages()
+ # ]
+ # Step 1: create the agent
+ new_prompt = prompt.copy()
+ messages = new_prompt.get_messages(dataset_item)
+ new_prompt.set_messages(messages)
+ agent = self.agent_class(new_prompt)
 
  # --- Step 2: Call the model ---
  try:
- logger.debug(f"Calling LLM with prompt length: {sum(len(msg['content']) for msg in messages)}")
- raw_model_output = self._call_model(
- messages=messages,
- is_reasoning=False,
- optimization_id=optimization_id,
+ logger.debug(
+ f"Calling LLM with prompt length: {sum(len(msg['content']) for msg in messages)}"
  )
+ raw_model_output = agent.invoke(messages)
  logger.debug(f"LLM raw response length: {len(raw_model_output)}")
  logger.debug(f"LLM raw output: {raw_model_output}")
  except Exception as e:
  logger.error(f"Error calling model with prompt: {e}")
  logger.error(f"Failed prompt: {messages}")
- logger.error(f"Prompt length: {sum(len(msg['content']) for msg in messages)}")
+ logger.error(
+ f"Prompt length: {sum(len(msg['content']) for msg in messages)}"
+ )
  raise
 
  # --- Step 3: Clean the model's output before metric evaluation ---
@@ -307,8 +322,9 @@ class MetaPromptOptimizer(BaseOptimizer):
  dataset=dataset,
  metric=metric,
  evaluated_task=llm_task,
+ dataset_item_ids=dataset_item_ids,
  num_threads=self.num_threads,
- project_name=self.project_name,
+ project_name=self.agent_class.project_name,
  n_samples=subset_size, # Use subset_size for trials, None for full dataset
  experiment_config=experiment_config,
  optimization_id=optimization_id,
@@ -317,7 +333,7 @@ class MetaPromptOptimizer(BaseOptimizer):
  logger.debug(f"Evaluation score: {score:.4f}")
  return score
 
- def optimize_prompt( # type: ignore[override]
+ def optimize_prompt(
  self,
  prompt: chat_prompt.ChatPrompt,
  dataset: Dataset,
@@ -325,13 +341,13 @@ class MetaPromptOptimizer(BaseOptimizer):
  experiment_config: Optional[Dict] = None,
  n_samples: Optional[int] = None,
  auto_continue: bool = False,
- **kwargs,
+ agent_class: Optional[Type[OptimizableAgent]] = None,
+ **kwargs: Any,
  ) -> OptimizationResult:
  """
  Optimize a prompt using meta-reasoning.
 
  Args:
- prompt: The prompt to optimize
  dataset: The dataset to evaluate against
  metric: The metric to use for evaluation
  experiment_config: A dictionary to log with the experiments
@@ -344,12 +360,24 @@ class MetaPromptOptimizer(BaseOptimizer):
  """
  if not isinstance(prompt, chat_prompt.ChatPrompt):
  raise ValueError("Prompt must be a ChatPrompt object")
-
+
  if not isinstance(dataset, Dataset):
  raise ValueError("Dataset must be a Dataset object")
-
- if not isinstance(metric, Callable):
- raise ValueError("Metric must be a function that takes `dataset_item` and `llm_output` as arguments.")
+
+ if not callable(metric):
+ raise ValueError(
+ "Metric must be a function that takes `dataset_item` and `llm_output` as arguments."
+ )
+
+ if prompt.model is None:
+ prompt.model = self.model
+ if prompt.model_kwargs is None:
+ prompt.model_kwargs = self.model_kwargs
+
+ if agent_class is None:
+ self.agent_class = utils.create_litellm_agent_class(prompt)
+ else:
+ self.agent_class = agent_class
 
  total_items = len(dataset.get_items())
  if n_samples is not None and n_samples > total_items:
@@ -358,12 +386,11 @@ class MetaPromptOptimizer(BaseOptimizer):
  )
  n_samples = None
 
-
  optimization = None
  try:
  optimization = self._opik_client.create_optimization(
  dataset_name=dataset.name,
- objective_name=getattr(metric, '__name__', str(metric)),
+ objective_name=getattr(metric, "__name__", str(metric)),
  metadata={"optimizer": self.__class__.__name__},
  )
  logger.debug(f"Created optimization with ID: {optimization.id}")
@@ -377,21 +404,22 @@ class MetaPromptOptimizer(BaseOptimizer):
  algorithm=self.__class__.__name__,
  optimization_id=optimization.id if optimization is not None else None,
  dataset_id=dataset.id,
- verbose=self.verbose
+ verbose=self.verbose,
  )
  reporting.display_configuration(
- messages=prompt.formatted_messages,
+ messages=prompt.get_messages(),
  optimizer_config={
  "optimizer": self.__class__.__name__,
  "n_samples": n_samples,
- "auto_continue": auto_continue
+ "auto_continue": auto_continue,
  },
- verbose=self.verbose
+ verbose=self.verbose,
  )
 
  try:
+ optimization_id = optimization.id if optimization is not None else None
  result = self._optimize_prompt(
- optimization_id=optimization.id if optimization is not None else None,
+ optimization_id=optimization_id,
  prompt=prompt,
  dataset=dataset,
  metric=metric,
@@ -413,31 +441,33 @@ class MetaPromptOptimizer(BaseOptimizer):
 
  def _optimize_prompt(
  self,
- optimization_id: str,
+ optimization_id: Optional[str],
  prompt: chat_prompt.ChatPrompt,
  dataset: Dataset,
  metric: Callable,
  experiment_config: Optional[Dict],
- n_samples: int,
+ n_samples: Optional[int],
  auto_continue: bool,
- **kwargs,
+ **kwargs: Any,
  ) -> OptimizationResult:
  self.auto_continue = auto_continue
  self.dataset = dataset
  self.prompt = prompt
- self.llm_call_counter = 0 # Reset counter for run
- initial_prompt: List[Dict[str, str]] = prompt.formatted_messages
+ self.llm_call_counter = 0 # Reset counter for run
+ initial_prompt = prompt
 
- current_prompt = prompt.formatted_messages
+ current_prompt = prompt
  experiment_config = experiment_config or {}
  experiment_config = {
  **experiment_config,
  **{
  "optimizer": self.__class__.__name__,
- "metric": getattr(metric, '__name__', str(metric)),
- "dataset": self.dataset.name,
+ "agent_class": self.agent_class.__name__,
+ "agent_config": prompt.to_dict(),
+ "metric": getattr(metric, "__name__", str(metric)),
+ "dataset": dataset.name,
  "configuration": {
- "prompt": current_prompt,
+ "prompt": prompt.get_messages(),
  "rounds": self.rounds,
  "num_prompts_per_round": self.num_prompts_per_round,
  },
@@ -445,8 +475,8 @@ class MetaPromptOptimizer(BaseOptimizer):
  }
 
  with reporting.display_evaluation(verbose=self.verbose) as baseline_reporter:
- initial_score = self.evaluate_prompt(
- prompt=prompt,
+ initial_score = self._evaluate_prompt(
+ prompt,
  optimization_id=optimization_id,
  dataset=dataset,
  metric=metric,
@@ -457,20 +487,22 @@ class MetaPromptOptimizer(BaseOptimizer):
  )
  best_score = initial_score
  best_prompt = current_prompt
- rounds = []
+ rounds: List[OptimizationRound] = []
 
  baseline_reporter.set_score(initial_score)
 
  reporting.display_optimization_start_message(verbose=self.verbose)
- with reporting.display_round_progress(self.rounds, verbose=self.verbose) as round_reporter:
+ with reporting.display_round_progress(
+ self.rounds, verbose=self.verbose
+ ) as round_reporter:
  for round_num in range(self.rounds):
-
  round_reporter.round_start(round_num)
  previous_best_score = best_score
-
+
  # Step 1. Create a set of candidate prompts
  try:
  candidate_prompts = self._generate_candidate_prompts(
+ project_name=self.agent_class.project_name,
  current_prompt=best_prompt,
  best_score=best_score,
  round_num=round_num,
@@ -483,14 +515,19 @@ class MetaPromptOptimizer(BaseOptimizer):
  continue
 
  # Step 2. Score each candidate prompt
- prompt_scores = []
+ prompt_scores: List[Tuple[chat_prompt.ChatPrompt, float]] = []
  for candidate_count, prompt in enumerate(candidate_prompts):
- with reporting.display_prompt_candidate_scoring_report(candidate_count, prompt, verbose=self.verbose) as eval_report:
+ with reporting.display_prompt_candidate_scoring_report(
+ verbose=self.verbose
+ ) as eval_report:
  eval_report.set_generated_prompts(candidate_count, prompt)
 
+ new_prompt = current_prompt.copy()
+ new_prompt.set_messages(prompt.get_messages())
+
  try:
- prompt_score = self.evaluate_prompt(
- prompt=chat_prompt.ChatPrompt(messages=prompt),
+ prompt_score = self._evaluate_prompt(
+ prompt=new_prompt,
  optimization_id=optimization_id,
  dataset=dataset,
  metric=metric,
@@ -501,34 +538,37 @@ class MetaPromptOptimizer(BaseOptimizer):
  )
 
  eval_report.set_final_score(best_score, prompt_score)
- except Exception as e:
- raise ValueError(f"Error evaluating candidate prompt: {e}")
+ except Exception:
+ print("Failed evaluating agent; continuing...")
+ prompt_score = 0
 
  prompt_scores.append((prompt, prompt_score))
-
+
  # Step 3. Identify potential improvements
  if not prompt_scores:
- logger.warning("No prompts were successfully evaluated in this round")
+ logger.warning(
+ "No prompts were successfully evaluated in this round"
+ )
  break
 
  prompt_scores.sort(key=lambda x: x[1], reverse=True)
- best_candidate_this_round, best_cand_score_avg = (
- prompt_scores[0]
+ best_candidate_this_round, best_cand_score_avg = prompt_scores[0]
+ improvement = self._calculate_improvement(
+ best_cand_score_avg, best_score
  )
- improvement = self._calculate_improvement(best_cand_score_avg, best_score)
- round_reporter.round_end(round_num, best_cand_score_avg, best_score, best_prompt)
-
+ round_reporter.round_end(round_num, best_cand_score_avg, best_score)
+
  round_data = self._create_round_data(
  round_num=round_num,
- current_best_prompt=chat_prompt.ChatPrompt(messages=best_candidate_this_round),
- current_best_score=best_cand_score_avg,
- best_prompt_overall=chat_prompt.ChatPrompt(messages=best_prompt),
+ current_best_prompt=best_prompt,
+ current_best_score=best_score,
+ best_prompt_overall=best_prompt,
  evaluated_candidates=prompt_scores,
  previous_best_score=previous_best_score,
  improvement_this_round=improvement,
  )
  rounds.append(round_data)
- self._add_to_history(round_data.model_dump())
+ self._add_to_history(round_data)
 
  if improvement > 0:
  best_score = best_cand_score_avg
@@ -537,17 +577,21 @@ class MetaPromptOptimizer(BaseOptimizer):
  reporting.display_result(
  initial_score,
  best_score,
- best_prompt,
- verbose=self.verbose
+ best_prompt.get_messages() if best_prompt is not None else [],
+ verbose=self.verbose,
  )
 
  return self._create_result(
  metric,
- initial_prompt=initial_prompt,
- best_prompt=best_prompt,
+ initial_prompt=initial_prompt.get_messages()
+ if initial_prompt is not None
+ else [],
+ best_prompt=best_prompt.get_messages() if best_prompt is not None else [],
  best_score=best_score,
  initial_score=initial_score,
  rounds=rounds,
+ dataset_id=dataset.id,
+ optimization_id=optimization_id,
  )
 
  def _calculate_improvement(
@@ -566,7 +610,7 @@ class MetaPromptOptimizer(BaseOptimizer):
  current_best_prompt: chat_prompt.ChatPrompt,
  current_best_score: float,
  best_prompt_overall: chat_prompt.ChatPrompt,
- evaluated_candidates: List[tuple[str, float, List[float]]],
+ evaluated_candidates: List[Tuple[chat_prompt.ChatPrompt, float]],
  previous_best_score: float,
  improvement_this_round: float,
  ) -> OptimizationRound:
@@ -578,7 +622,7 @@ class MetaPromptOptimizer(BaseOptimizer):
  )
  generated_prompts_log.append(
  {
- "prompt": prompt,
+ "prompt": prompt.get_messages(),
  "score": score,
  "improvement": improvement_vs_prev,
  }
@@ -602,6 +646,8 @@ class MetaPromptOptimizer(BaseOptimizer):
  best_score: float,
  initial_score: float,
  rounds: List[OptimizationRound],
+ dataset_id: Optional[str],
+ optimization_id: Optional[str],
  ) -> OptimizationResult:
  """Create the final OptimizationResult object."""
  details = {
@@ -609,7 +655,7 @@ class MetaPromptOptimizer(BaseOptimizer):
  "final_score": best_score,
  "rounds": rounds,
  "total_rounds": len(rounds),
- "metric_name": getattr(metric, '__name__', str(metric)),
+ "metric_name": getattr(metric, "__name__", str(metric)),
  "model": self.model,
  "temperature": self.model_kwargs.get("temperature"),
  }
@@ -620,9 +666,11 @@ class MetaPromptOptimizer(BaseOptimizer):
  score=best_score,
  initial_prompt=initial_prompt,
  initial_score=initial_score,
- metric_name=getattr(metric, '__name__', str(metric)),
+ metric_name=getattr(metric, "__name__", str(metric)),
  details=details,
- llm_calls=self.llm_call_counter
+ llm_calls=self.llm_call_counter,
+ dataset_id=dataset_id,
+ optimization_id=optimization_id,
  )
 
  def _get_task_context(self, metric: Callable) -> str:
@@ -648,7 +696,7 @@ class MetaPromptOptimizer(BaseOptimizer):
  context += f"Dataset fields (includes both input and optionally the expected output): {', '.join([x for x in sample.keys() if x != 'id'])}\n"
  context += f"Evaluation Metric:\n{metrics_str}\n"
  context += f"\nExample:\n{json.dumps(sample)}\n"
-
+
  return context
 
  def _generate_candidate_prompts(
@@ -659,14 +707,14 @@ class MetaPromptOptimizer(BaseOptimizer):
  previous_rounds: List[OptimizationRound],
  metric: Callable,
  optimization_id: Optional[str] = None,
- ) -> List[str]:
+ project_name: Optional[str] = None,
+ ) -> List[chat_prompt.ChatPrompt]:
  """Generate candidate prompts using meta-prompting."""
  with reporting.display_candidate_generation_report(
- self.num_prompts_per_round,
- verbose=self.verbose
- ) as candidate_generation_report:
+ self.num_prompts_per_round, verbose=self.verbose
+ ) as candidate_generation_report:
  logger.debug(f"\nGenerating candidate prompts for round {round_num + 1}")
- logger.debug(f"Generating from prompt: {current_prompt}")
+ logger.debug(f"Generating from prompt: {current_prompt.get_messages()}")
  logger.debug(f"Current best score: {best_score:.4f}")
 
  history_context = self._build_history_context(previous_rounds)
@@ -678,16 +726,22 @@ class MetaPromptOptimizer(BaseOptimizer):
  if self.enable_context:
  task_context_str = self._get_task_context(metric=metric)
  analysis_instruction = "Analyze the example provided (if any), the metric description (if any), and the history of scores."
- metric_focus_instruction = f"Focus on improving the score for the metric: {metric.__name__}."
+ metric_focus_instruction = (
+ f"Focus on improving the score for the metric: {metric.__name__}."
+ )
  improvement_point_1 = "1. Be more specific and clear about expectations based on the metric and task."
- logger.debug("Task context and metric-specific instructions enabled for reasoning prompt.")
+ logger.debug(
+ "Task context and metric-specific instructions enabled for reasoning prompt."
+ )
  else:
- analysis_instruction = "Analyze the history of scores and the current prompt\'s performance."
+ analysis_instruction = "Analyze the history of scores and the current prompt's performance."
  metric_focus_instruction = "Focus on generating diverse and effective prompt variations based on the history."
  improvement_point_1 = "1. Be more specific and clear about expectations based on the task."
- logger.debug("Task context and metric-specific instructions disabled for reasoning prompt.")
+ logger.debug(
+ "Task context and metric-specific instructions disabled for reasoning prompt."
+ )
 
- user_prompt = f"""Current prompt: {current_prompt}
+ user_prompt = f"""Current prompt: {current_prompt.get_messages()}
  Current score: {best_score}
  {history_context}
  {task_context_str}
@@ -707,9 +761,10 @@ class MetaPromptOptimizer(BaseOptimizer):
  try:
  # Use _call_model which handles selecting reasoning_model
  content = self._call_model(
+ project_name,
  messages=[
  {"role": "system", "content": self._REASONING_SYSTEM_PROMPT},
- {"role": "user", "content": user_prompt}
+ {"role": "user", "content": user_prompt},
  ],
  is_reasoning=True,
  optimization_id=optimization_id,
@@ -729,9 +784,13 @@ class MetaPromptOptimizer(BaseOptimizer):
  try:
  json_result = json.loads(json_match.group())
  except json.JSONDecodeError as e:
- raise ValueError(f"Could not parse JSON extracted via regex: {e} - received: {json_match.group()}")
+ raise ValueError(
+ f"Could not parse JSON extracted via regex: {e} - received: {json_match.group()}"
+ )
  else:
- raise ValueError(f"No JSON object found in response via regex. - received: {content}")
+ raise ValueError(
+ f"No JSON object found in response via regex. - received: {content}"
+ )
 
  # Validate the parsed JSON structure
  if isinstance(json_result, list) and len(json_result) == 1:
@@ -739,27 +798,46 @@ class MetaPromptOptimizer(BaseOptimizer):
 
  if not isinstance(json_result, dict) or "prompts" not in json_result:
  logger.debug(f"Parsed JSON content: {json_result}")
- raise ValueError(f"Parsed JSON is not a dictionary or missing 'prompts' key. - received: {json_result}")
+ raise ValueError(
+ f"Parsed JSON is not a dictionary or missing 'prompts' key. - received: {json_result}"
+ )
 
  if not isinstance(json_result["prompts"], list):
  logger.debug(f"Content of 'prompts': {json_result.get('prompts')}")
- raise ValueError(f"'prompts' key does not contain a list. - received: {json_result.get('prompts')}")
+ raise ValueError(
+ f"'prompts' key does not contain a list. - received: {json_result.get('prompts')}"
+ )
 
  # Extract and log valid prompts
- valid_prompts = []
+ valid_prompts: List[chat_prompt.ChatPrompt] = []
  for item in json_result["prompts"]:
  if (
  isinstance(item, dict)
  and "prompt" in item
  and isinstance(item["prompt"], list)
  ):
- prompt_text = item["prompt"]
- valid_prompts.append(prompt_text)
-
+ # NOTE: might be brittle
+ if current_prompt.user:
+ user_text = current_prompt.user
+ else:
+ if current_prompt.messages is not None:
+ user_text = current_prompt.messages[-1]["content"]
+ else:
+ raise Exception(
+ "User content not found in chat-prompt!"
+ )
+
+ valid_prompts.append(
+ chat_prompt.ChatPrompt(
+ system=item["prompt"][0]["content"],
+ user=user_text,
+ )
+ )
+
  # Log details
  focus = item.get("improvement_focus", "N/A")
  reasoning = item.get("reasoning", "N/A")
- logger.debug(f"Generated prompt: {prompt_text}")
+ logger.debug(f"Generated prompt: {item['prompt']}")
  logger.debug(f" Improvement focus: {focus}")
  logger.debug(f" Reasoning: {reasoning}")
  else:
@@ -768,17 +846,19 @@ class MetaPromptOptimizer(BaseOptimizer):
  )
 
  if not valid_prompts:
- raise ValueError("No valid prompts found in the parsed JSON response after validation.")
-
- candidate_generation_report.set_generated_prompts(
- self.num_prompts_per_round
- )
-
+ raise ValueError(
+ "No valid prompts found in the parsed JSON response after validation."
+ )
+
+ candidate_generation_report.set_generated_prompts()
+
  return valid_prompts
  # --- End Robust Parsing ---
 
  except Exception as e:
- raise ValueError(f"Unexpected error during candidate prompt generation: {e}")
+ raise ValueError(
+ f"Unexpected error during candidate prompt generation: {e}"
+ )
 
  def _build_history_context(self, previous_rounds: List[OptimizationRound]) -> str:
  """Build context from previous optimization rounds."""