opik-optimizer 0.9.1__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. opik_optimizer/__init__.py +7 -3
  2. opik_optimizer/_throttle.py +8 -8
  3. opik_optimizer/base_optimizer.py +98 -45
  4. opik_optimizer/cache_config.py +5 -3
  5. opik_optimizer/datasets/ai2_arc.py +15 -13
  6. opik_optimizer/datasets/cnn_dailymail.py +19 -15
  7. opik_optimizer/datasets/election_questions.py +10 -11
  8. opik_optimizer/datasets/gsm8k.py +16 -11
  9. opik_optimizer/datasets/halu_eval.py +6 -5
  10. opik_optimizer/datasets/hotpot_qa.py +17 -16
  11. opik_optimizer/datasets/medhallu.py +10 -7
  12. opik_optimizer/datasets/rag_hallucinations.py +11 -8
  13. opik_optimizer/datasets/ragbench.py +17 -9
  14. opik_optimizer/datasets/tiny_test.py +33 -37
  15. opik_optimizer/datasets/truthful_qa.py +18 -12
  16. opik_optimizer/demo/cache.py +6 -6
  17. opik_optimizer/demo/datasets.py +3 -7
  18. opik_optimizer/evolutionary_optimizer/__init__.py +3 -1
  19. opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +748 -437
  20. opik_optimizer/evolutionary_optimizer/reporting.py +155 -76
  21. opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +291 -181
  22. opik_optimizer/few_shot_bayesian_optimizer/reporting.py +79 -28
  23. opik_optimizer/logging_config.py +19 -15
  24. opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +234 -138
  25. opik_optimizer/meta_prompt_optimizer/reporting.py +121 -47
  26. opik_optimizer/mipro_optimizer/__init__.py +2 -0
  27. opik_optimizer/mipro_optimizer/_lm.py +41 -9
  28. opik_optimizer/mipro_optimizer/_mipro_optimizer_v2.py +37 -26
  29. opik_optimizer/mipro_optimizer/mipro_optimizer.py +135 -67
  30. opik_optimizer/mipro_optimizer/utils.py +5 -2
  31. opik_optimizer/optimizable_agent.py +179 -0
  32. opik_optimizer/optimization_config/chat_prompt.py +143 -73
  33. opik_optimizer/optimization_config/configs.py +4 -3
  34. opik_optimizer/optimization_config/mappers.py +18 -6
  35. opik_optimizer/optimization_result.py +28 -20
  36. opik_optimizer/py.typed +0 -0
  37. opik_optimizer/reporting_utils.py +96 -46
  38. opik_optimizer/task_evaluator.py +12 -14
  39. opik_optimizer/utils.py +122 -37
  40. {opik_optimizer-0.9.1.dist-info → opik_optimizer-1.0.0.dist-info}/METADATA +8 -8
  41. opik_optimizer-1.0.0.dist-info/RECORD +50 -0
  42. opik_optimizer-0.9.1.dist-info/RECORD +0 -48
  43. {opik_optimizer-0.9.1.dist-info → opik_optimizer-1.0.0.dist-info}/WHEEL +0 -0
  44. {opik_optimizer-0.9.1.dist-info → opik_optimizer-1.0.0.dist-info}/licenses/LICENSE +0 -0
  45. {opik_optimizer-0.9.1.dist-info → opik_optimizer-1.0.0.dist-info}/top_level.txt +0 -0
@@ -1,22 +1,24 @@
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Type
+
  import json
  import logging
  import random
  from datetime import datetime
- from typing import Any, Callable, Dict, List, Optional, Tuple

  import litellm
- import opik
  import optuna
  import optuna.samplers
+
+ import opik
  from opik import Dataset
  from opik.evaluation.models.litellm import opik_monitor as opik_litellm_monitor
  from pydantic import BaseModel

  from opik_optimizer import base_optimizer
- from opik_optimizer.optimization_config import mappers
-
+ from ..utils import create_litellm_agent_class
+ from ..optimization_config import chat_prompt, mappers
+ from ..optimizable_agent import OptimizableAgent
  from .. import _throttle, optimization_result, task_evaluator, utils
- from ..optimization_config import chat_prompt
  from . import reporting

  _limiter = _throttle.get_rate_limiter_for_current_opik_installation()
@@ -52,37 +54,38 @@ Return your output as a JSON object with:
  Respond only with the JSON object. Do not include any explanation or extra text.
  """

+
  class FewShotPromptTemplate(BaseModel):
  message_list_with_placeholder: List[Dict[str, str]]
  example_template: str

+
  class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
  """
  The Few-Shot Bayesian Optimizer can be used to add few-shot examples to prompts. This algorithm
  employes a two stage pipeline:

- 1. We generate a few-shot prompt template that is inserted can be inserted into the prompt
+ 1. We generate a few-shot prompt template that is inserted can be inserted into the prompt
  provided
  2. We use Bayesian Optimization to determine the best examples to include in the prompt.

  This algorithm is best used when you have a well defined task and would like to guide the LLM
  by providing some examples.
  """
+
  def __init__(
  self,
  model: str,
- project_name: Optional[str] = "Optimization",
  min_examples: int = 2,
  max_examples: int = 8,
  seed: int = 42,
  n_threads: int = 8,
  verbose: int = 1,
- **model_kwargs,
+ **model_kwargs: Any,
  ) -> None:
  """
  Args:
  model: The model to used to evaluate the prompt
- project_name: Optional project name for tracking
  min_examples: Minimum number of examples to include
  max_examples: Maximum number of examples to include
  seed: Random seed for reproducibility
@@ -90,38 +93,55 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
  verbose: Controls internal logging/progress bars (0=off, 1=on).
  **model_kwargs: Additional model parameters
  """
- super().__init__(model, project_name, **model_kwargs)
+ super().__init__(model, verbose, **model_kwargs)
  self.min_examples = min_examples
  self.max_examples = max_examples
  self.seed = seed
  self.n_threads = n_threads
- self.verbose = verbose
- if verbose == 0:
+ if self.verbose == 0:
  logger.setLevel(logging.WARNING)
- elif verbose == 1:
+ elif self.verbose == 1:
  logger.setLevel(logging.INFO)
- elif verbose == 2:
+ elif self.verbose == 2:
  logger.setLevel(logging.DEBUG)
-
+
  self._opik_client = opik.Opik()
  self.llm_call_counter = 0
  logger.debug(f"Initialized FewShotBayesianOptimizer with model: {model}")

  @_throttle.rate_limited(_limiter)
- def _call_model(self, model, messages, seed, model_kwargs):
+ def _call_model(
+ self,
+ model: str,
+ messages: List[Dict[str, str]],
+ seed: int,
+ model_kwargs: Dict[str, Any],
+ ) -> Dict[str, Any]:
+ """
+ Args:
+ model: The model to use for the call
+ messages: List of message dictionaries with 'role' and 'content' keys
+ seed: Random seed for reproducibility
+ model_kwargs: Additional model parameters
+
+ Returns:
+ Dict containing the model's response
+ """
  self.llm_call_counter += 1

  current_model_kwargs = self.model_kwargs.copy()
  current_model_kwargs.update(model_kwargs)

  filtered_call_kwargs = current_model_kwargs.copy()
- filtered_call_kwargs.pop('n_trials', None)
- filtered_call_kwargs.pop('n_samples', None)
- filtered_call_kwargs.pop('n_iterations', None)
- filtered_call_kwargs.pop('min_examples', None)
- filtered_call_kwargs.pop('max_examples', None)
-
- final_params_for_litellm = opik_litellm_monitor.try_add_opik_monitoring_to_params(filtered_call_kwargs)
+ filtered_call_kwargs.pop("n_trials", None)
+ filtered_call_kwargs.pop("n_samples", None)
+ filtered_call_kwargs.pop("n_iterations", None)
+ filtered_call_kwargs.pop("min_examples", None)
+ filtered_call_kwargs.pop("max_examples", None)
+
+ final_params_for_litellm = (
+ opik_litellm_monitor.try_add_opik_monitoring_to_params(filtered_call_kwargs)
+ )

  response = litellm.completion(
  model=self.model,
@@ -135,6 +155,16 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
  def _split_dataset(
  self, dataset: List[Dict[str, Any]], train_ratio: float
  ) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
+ """
+ Split the dataset into training and validation sets.
+
+ Args:
+ dataset: List of dataset items
+ train_ratio: Ratio of items to use for training
+
+ Returns:
+ Tuple of (train_set, validation_set)
+ """
  """Split the dataset into training and validation sets.

  Args:
@@ -158,47 +188,56 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
  self,
  model: str,
  prompt: chat_prompt.ChatPrompt,
- few_shot_examples: List[Dict[str, Any]]
+ few_shot_examples: List[Dict[str, Any]],
  ) -> FewShotPromptTemplate:
+ """
+ Generate a few-shot prompt template that can be used to insert examples into the prompt.
+
+ Args:
+ model: The model to use for generating the template
+ prompt: The base prompt to modify
+ few_shot_examples: List of example pairs with input and output fields
+
+ Returns:
+ FewShotPromptTemplate containing the modified message list and example template
+ """
  """
  During this step we update the system prompt to include few-shot examples.
  """
  user_message = {
- "message_list": prompt.formatted_messages,
- "examples": few_shot_examples
+ "message_list": prompt.get_messages(),
+ "examples": few_shot_examples,
  }
-
+
  messages: List[Dict[str, str]] = [
  {"role": "system", "content": SYSTEM_PROMPT_TEMPLATE},
  {"role": "user", "content": json.dumps(user_message)},
  ]
-
+
  logger.debug(f"fewshot_prompt_template - Calling LLM with: {messages}")
- response = self._call_model(
- model,
- messages,
- self.seed,
- self.model_kwargs
- )
+ response = self._call_model(model, messages, self.seed, self.model_kwargs)
  logger.debug(f"fewshot_prompt_template - LLM response: {response}")

  try:
  res = utils.json_to_dict(response["choices"][0]["message"]["content"])
  return FewShotPromptTemplate(
  message_list_with_placeholder=res["message_list_with_placeholder"],
- example_template=res["example_template"]
+ example_template=res["example_template"],
  )
  except Exception as e:
- logger.error(f"Failed to compute few-shot prompt template: {e} - response: {response}")
+ logger.error(
+ f"Failed to compute few-shot prompt template: {e} - response: {response}"
+ )
  raise

  def _run_optimization(
  self,
+ prompt: chat_prompt.ChatPrompt,
  fewshot_prompt_template: FewShotPromptTemplate,
  dataset: Dataset,
  metric: Callable,
+ baseline_score: float,
  n_trials: int = 10,
- baseline_score: Optional[float] = None,
  optimization_id: Optional[str] = None,
  experiment_config: Optional[Dict] = None,
  n_samples: Optional[int] = None,
@@ -206,21 +245,22 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
  reporting.start_optimization_run(verbose=self.verbose)

  random.seed(self.seed)
- self.llm_call_counter = 0
-
+
  # Load the dataset
  dataset_items = dataset.get_items()
  all_dataset_item_ids = [item["id"] for item in dataset_items]
  eval_dataset_item_ids = all_dataset_item_ids
  if n_samples is not None and n_samples < len(dataset_items):
  eval_dataset_item_ids = random.sample(all_dataset_item_ids, n_samples)
-
+
  # Define the experiment configuration
  experiment_config = experiment_config or {}
  base_experiment_config = { # Base config for reuse
  **experiment_config,
  **{
  "optimizer": self.__class__.__name__,
+ "agent_class": self.agent_class.__name__,
+ "agent_config": prompt.to_dict(),
  "metric": metric.__name__,
  "dataset": dataset.name,
  "configuration": {},
@@ -249,31 +289,44 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
  for key, value in example.items():
  processed_example[key] = str(value)

- try:
- processed_demo_examples.append(
- fewshot_prompt_template.example_template.format(**processed_example)
- )
- except Exception:
- logger.error(f"Failed to format fewshot prompt template {fewshot_prompt_template} with example: {processed_example} ")
- raise
+ processed_demo_example = fewshot_prompt_template.example_template
+ for key, value in processed_example.items():
+ try:
+ processed_demo_example = processed_demo_example.replace(
+ f"{{{key}}}", str(value)
+ )
+ except Exception:
+ logger.error(
+ f"Failed to format fewshot prompt template {fewshot_prompt_template} with example: {processed_example} "
+ )
+ raise
+ processed_demo_examples.append(processed_demo_example)
  few_shot_examples = "\n\n".join(processed_demo_examples)
-
+
  llm_task = self._build_task_from_messages(
+ prompt=prompt,
  messages=fewshot_prompt_template.message_list_with_placeholder,
- few_shot_examples=few_shot_examples
+ few_shot_examples=few_shot_examples,
  )

- messages_for_reporting = [{
- "role": item["role"],
- "content": item["content"].replace(FEW_SHOT_EXAMPLE_PLACEHOLDER, few_shot_examples)
- } for item in fewshot_prompt_template.message_list_with_placeholder]
+ messages_for_reporting = [
+ {
+ "role": item["role"],
+ "content": item["content"].replace(
+ FEW_SHOT_EXAMPLE_PLACEHOLDER, few_shot_examples
+ ),
+ }
+ for item in fewshot_prompt_template.message_list_with_placeholder
+ ]

  # Log trial config
  trial_config = base_experiment_config.copy()
- trial_config["configuration"]["prompt"] = messages_for_reporting # Base instruction
- trial_config["configuration"][
- "examples"
- ] = processed_demo_examples # Log stringified examples
+ trial_config["configuration"]["prompt"] = (
+ messages_for_reporting # Base instruction
+ )
+ trial_config["configuration"]["examples"] = (
+ processed_demo_examples # Log stringified examples
+ )
  trial_config["configuration"]["n_examples"] = n_examples
  trial_config["configuration"]["example_indices"] = example_indices

@@ -282,7 +335,9 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
  )
  logger.debug(f"Evaluating trial {trial.number}...")

- with reporting.start_optimization_trial(trial.number, n_trials, verbose=self.verbose) as trial_reporter:
+ with reporting.start_optimization_trial(
+ trial.number, n_trials, verbose=self.verbose
+ ) as trial_reporter:
  trial_reporter.start_trial(messages_for_reporting)
  score = task_evaluator.evaluate(
  dataset=dataset,
@@ -290,7 +345,7 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
  metric=metric,
  evaluated_task=llm_task,
  num_threads=self.n_threads,
- project_name=self.project_name,
+ project_name=self.agent_class.project_name,
  experiment_config=trial_config,
  optimization_id=optimization_id,
  verbose=self.verbose,
@@ -302,7 +357,7 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
  trial_config = {
  "demo_examples": demo_examples,
  "message_list_with_placeholder": fewshot_prompt_template.message_list_with_placeholder,
- "message_list": messages_for_reporting
+ "message_list": messages_for_reporting,
  }
  trial.set_user_attr("score", score)
  trial.set_user_attr("config", trial_config)
@@ -324,62 +379,89 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
  # Explicitly create and seed the sampler for Optuna
  sampler = optuna.samplers.TPESampler(seed=self.seed)
  study = optuna.create_study(direction="maximize", sampler=sampler)
-
+
  study.optimize(
- optimization_objective,
- n_trials=n_trials,
- show_progress_bar=False
+ optimization_objective, n_trials=n_trials, show_progress_bar=False
  )
-
+
  optuna_history_processed = []
  for trial_idx, trial in enumerate(study.trials):
  if trial.state == optuna.trial.TrialState.COMPLETE:
  trial_config = trial.user_attrs.get("config", {})
- prompt_cand_display = trial_config.get('message_list') # Default to None
-
- score_val = trial.value # This can be None if trial failed to produce a score
+ prompt_cand_display = trial_config.get(
+ "message_list"
+ ) # Default to None
+
+ score_val = (
+ trial.value
+ ) # This can be None if trial failed to produce a score
  duration_val = None
  if trial.datetime_complete and trial.datetime_start:
- duration_val = (trial.datetime_complete - trial.datetime_start).total_seconds()
+ duration_val = (
+ trial.datetime_complete - trial.datetime_start
+ ).total_seconds()

  iter_detail = {
- "iteration": trial.number + 1,
- "timestamp": trial.datetime_start.isoformat() if trial.datetime_start else datetime.now().isoformat(),
+ "iteration": trial.number + 1,
+ "timestamp": (
+ trial.datetime_start.isoformat()
+ if trial.datetime_start
+ else datetime.now().isoformat()
+ ),
  "prompt_candidate": prompt_cand_display,
- "parameters_used": {
- "optuna_params": trial.user_attrs.get("config", {}),
- "example_indices": trial.user_attrs.get("example_indices", []) # Default to empty list
+ "parameters_used": {
+ "optuna_params": trial.user_attrs.get("config", {}),
+ "example_indices": trial.user_attrs.get(
+ "example_indices", []
+ ), # Default to empty list
  },
- "scores": [{
- "metric_name": metric.__name__,
- "score": score_val, # Can be None
- }],
+ "scores": [
+ {
+ "metric_name": metric.__name__,
+ "score": score_val, # Can be None
+ }
+ ],
  "duration_seconds": duration_val,
  }
  optuna_history_processed.append(iter_detail)
  else:
- logger.warning(f"Skipping trial {trial.number} from history due to state: {trial.state}. Value: {trial.value}")
+ logger.warning(
+ f"Skipping trial {trial.number} from history due to state: {trial.state}. Value: {trial.value}"
+ )

  best_trial = study.best_trial
  best_score = best_trial.value
  best_example_indices = best_trial.user_attrs.get("example_indices", [])

+ if best_score <= baseline_score:
+ best_score = baseline_score
+ best_prompt = prompt.get_messages()
+ else:
+ best_prompt = best_trial.user_attrs["config"]["message_list"]
+
  reporting.display_result(
  initial_score=baseline_score,
  best_score=best_score,
- best_prompt=best_trial.user_attrs["config"]["message_list"],
- verbose=self.verbose
+ best_prompt=best_prompt,
+ verbose=self.verbose,
  )

  return optimization_result.OptimizationResult(
  optimizer=self.__class__.__name__,
- prompt=best_trial.user_attrs["config"]["message_list"],
+ prompt=best_prompt,
+ initial_prompt=prompt.get_messages(),
+ initial_score=baseline_score,
  score=best_score,
  metric_name=metric.__name__,
  details={
- "chat_messages": best_trial.user_attrs["config"]["message_list"],
+ "initial_score": baseline_score,
+ "chat_messages": (
+ best_trial.user_attrs["config"]["message_list"]
+ if best_trial.user_attrs["config"]
+ else []
+ ),
  "prompt_parameter": best_trial.user_attrs["config"],
- #"n_examples": best_n_examples,
+ # "n_examples": best_n_examples,
  "example_indices": best_example_indices,
  "trial_number": best_trial.number,
  "total_trials": n_trials,
@@ -390,30 +472,54 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
  "temperature": self.model_kwargs.get("temperature"),
  },
  history=optuna_history_processed,
- llm_calls=self.llm_call_counter
+ llm_calls=self.llm_call_counter,
+ dataset_id=dataset.id,
+ optimization_id=optimization_id,
  )

- def optimize_prompt( # type: ignore
+ def optimize_prompt( # type: ignore
  self,
  prompt: chat_prompt.ChatPrompt,
  dataset: Dataset,
  metric: Callable,
  n_trials: int = 10,
+ agent_class: Optional[Type[OptimizableAgent]] = None,
  experiment_config: Optional[Dict] = None,
  n_samples: Optional[int] = None,
  ) -> optimization_result.OptimizationResult:
  """
  Args:
- prompt: The prompt to optimize
+ prompt:
  dataset: Opik Dataset to optimize on
  metric: Metric function to evaluate on
  n_trials: Number of trials for Bayesian Optimization
  experiment_config: Optional configuration for the experiment, useful to log additional metadata
  n_samples: Optional number of items to test in the dataset
-
+
  Returns:
  OptimizationResult: Result of the optimization
  """
+ if not isinstance(prompt, chat_prompt.ChatPrompt):
+ raise ValueError("Prompt must be a ChatPrompt object")
+
+ if not isinstance(dataset, Dataset):
+ raise ValueError("Dataset must be a Dataset object")
+
+ if not callable(metric):
+ raise ValueError(
+ "Metric must be a function that takes `dataset_item` and `llm_output` as arguments."
+ )
+
+ if prompt.model is None:
+ prompt.model = self.model
+ if prompt.model_kwargs is None:
+ prompt.model_kwargs = self.model_kwargs
+
+ if agent_class is None:
+ self.agent_class = create_litellm_agent_class(prompt)
+ else:
+ self.agent_class = agent_class
+
  optimization = None
  try:
  optimization = self._opik_client.create_optimization(
@@ -421,87 +527,95 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
  objective_name=metric.__name__,
  metadata={"optimizer": self.__class__.__name__},
  )
+ optimization_run_id = optimization.id
  except Exception:
  logger.warning(
  "Opik server does not support optimizations. Please upgrade opik."
  )
  optimization = None
+ optimization_run_id = None

- try:
- # Start experiment reporting
- reporting.display_header("Few-Shot Bayesian Optimizer", verbose=self.verbose)
- reporting.display_configuration(
- prompt.formatted_messages,
- optimizer_config={
- "optimizer": self.__class__.__name__,
- "metric": metric.__name__,
- "n_trials": n_trials,
- "n_samples": n_samples
- },
- verbose=self.verbose
- )
-
- utils.disable_experiment_reporting()
-
- # Step 1. Compute the baseline evaluation
- with reporting.display_evaluation(message="First we will establish the baseline performance:", verbose=self.verbose) as eval_report:
- baseline_score = self.evaluate_prompt(
- prompt=prompt,
- dataset=dataset,
- metric=metric,
- n_samples=n_samples,
- optimization_id=optimization.id if optimization is not None else None
- )
-
- eval_report.set_score(baseline_score)
-
- # Step 2. Create the few-shot prompt template
- with reporting.creation_few_shot_prompt_template(verbose=self.verbose) as fewshot_template_report:
- fewshot_template = self._create_fewshot_prompt_template(
- model=self.model,
- prompt=prompt,
- few_shot_examples=[{k: v for k, v in item.items() if k != 'id'}
- for item in dataset.get_items(nb_samples=10)]
- )
+ # Start experiment reporting
+ reporting.display_header(
+ algorithm=self.__class__.__name__,
+ optimization_id=optimization_run_id,
+ dataset_id=dataset.id,
+ verbose=self.verbose,
+ )
+ reporting.display_configuration(
+ prompt.get_messages(),
+ optimizer_config={
+ "optimizer": self.__class__.__name__,
+ "metric": metric.__name__,
+ "n_trials": n_trials,
+ "n_samples": n_samples,
+ },
+ verbose=self.verbose,
+ )

- fewshot_template_report.set_fewshot_template(fewshot_template)
+ utils.disable_experiment_reporting()

- # Step 3. Start the optimization process
- result = self._run_optimization(
- fewshot_prompt_template=fewshot_template,
+ # Step 1. Compute the baseline evaluation
+ with reporting.display_evaluation(
+ message="First we will establish the baseline performance:",
+ verbose=self.verbose,
+ ) as eval_report:
+ baseline_score = self._evaluate_prompt(
+ prompt,
  dataset=dataset,
  metric=metric,
- optimization_id=optimization.id if optimization is not None else None,
- experiment_config=experiment_config,
- n_trials=n_trials,
- baseline_score=baseline_score,
  n_samples=n_samples,
+ optimization_id=(optimization.id if optimization is not None else None),
  )
- if optimization:
- self.update_optimization(optimization, status="completed")

- utils.enable_experiment_reporting()
- return result
- except Exception as e:
- if optimization:
- self.update_optimization(optimization, status="cancelled")
- logger.error(f"FewShotBayesian optimization failed: {e}", exc_info=True)
- utils.enable_experiment_reporting()
- raise e
+ eval_report.set_score(baseline_score)

- def evaluate_prompt(
+ # Step 2. Create the few-shot prompt template
+ with reporting.creation_few_shot_prompt_template(
+ verbose=self.verbose
+ ) as fewshot_template_report:
+ fewshot_template = self._create_fewshot_prompt_template(
+ model=self.model,
+ prompt=prompt,
+ few_shot_examples=[
+ {k: v for k, v in item.items() if k != "id"}
+ for item in dataset.get_items(nb_samples=10)
+ ],
+ )
+
+ fewshot_template_report.set_fewshot_template(fewshot_template)
+
+ # Step 3. Start the optimization process
+ result = self._run_optimization(
+ prompt=prompt,
+ fewshot_prompt_template=fewshot_template,
+ dataset=dataset,
+ metric=metric,
+ baseline_score=baseline_score,
+ optimization_id=optimization.id if optimization is not None else None,
+ experiment_config=experiment_config,
+ n_trials=n_trials,
+ n_samples=n_samples,
+ )
+ if optimization:
+ self.update_optimization(optimization, status="completed")
+
+ utils.enable_experiment_reporting()
+ return result
+
+ def _evaluate_prompt(
  self,
  prompt: chat_prompt.ChatPrompt,
  dataset: opik.Dataset,
  metric: Callable,
+ n_samples: Optional[int] = None,
  dataset_item_ids: Optional[List[str]] = None,
  experiment_config: Optional[Dict] = None,
  optimization_id: Optional[str] = None,
- n_samples: Optional[int] = None,
+ **kwargs: Any,
  ) -> float:
  """
  Args:
- prompt: The prompt to evaluate
  dataset: Opik Dataset to evaluate the prompt on
  metric: Metric function to evaluate on, should have the arguments `dataset_item` and `llm_output`
  dataset_item_ids: Optional list of dataset item IDs to evaluate
@@ -511,27 +625,19 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
  Returns:
  float: The evaluation score
  """
- # Ensure prompt is correctly formatted
- if not all(
- isinstance(item, dict) and "role" in item and "content" in item
- for item in prompt.formatted_messages
- ):
- raise ValueError(
- "A ChatPrompt must be a list of dictionaries with 'role' and 'content' keys."
- )
-
- llm_task = self._build_task_from_messages(prompt.formatted_messages)
+ llm_task = self._build_task_from_messages(prompt, prompt.get_messages())

  experiment_config = experiment_config or {}
+ experiment_config["project_name"] = self.agent_class.__name__
  experiment_config = {
  **experiment_config,
  **{
  "optimizer": self.__class__.__name__,
+ "agent_class": self.agent_class.__name__,
+ "agent_config": prompt.to_dict(),
  "metric": metric.__name__,
  "dataset": dataset.name,
- "configuration": {
- "prompt": prompt.formatted_messages,
- },
+ "configuration": {"prompt": prompt.get_messages()},
  },
  }

@@ -549,7 +655,7 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
  metric=metric,
  evaluated_task=llm_task,
  num_threads=self.n_threads,
- project_name=self.project_name,
+ project_name=self.agent_class.project_name,
  experiment_config=experiment_config,
  optimization_id=optimization_id,
  verbose=self.verbose,
@@ -558,32 +664,36 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):

  return score

-
  def _build_task_from_messages(
- self, messages: List[Dict[str, str]], few_shot_examples: Optional[str] = None
- ):
+ self,
+ prompt: chat_prompt.ChatPrompt,
+ messages: List[Dict[str, str]],
+ few_shot_examples: Optional[str] = None,
+ ) -> Callable[[Dict[str, Any]], Dict[str, Any]]:
+ new_prompt = prompt.copy()
+ new_prompt.set_messages(messages)
+ agent = self.agent_class(new_prompt)
+
  def llm_task(dataset_item: Dict[str, Any]) -> Dict[str, Any]:
- for key, value in dataset_item.items():
- prompt_ = [{
- "role": item["role"],
- "content": item["content"].replace("{" + key + "}", str(value))
- } for item in messages]
+ """
+ Process a single dataset item through the LLM task.
+
+ Args:
+ dataset_item: Dictionary containing the dataset item data
+
+ Returns:
+ Dictionary containing the LLM's response
+ """
+ messages = new_prompt.get_messages(dataset_item)

  if few_shot_examples:
- prompt_ = [{
- "role": item["role"],
- "content": item["content"].replace(FEW_SHOT_EXAMPLE_PLACEHOLDER, few_shot_examples)
- } for item in prompt_]
-
- response = self._call_model(
- model=self.model,
- messages=prompt_,
- seed=self.seed,
- model_kwargs=self.model_kwargs
- )
+ for message in messages:
+ message["content"] = message["content"].replace(
+ FEW_SHOT_EXAMPLE_PLACEHOLDER, few_shot_examples
+ )

- return {
- mappers.EVALUATED_LLM_TASK_OUTPUT: response.choices[0].message.content
- }
+ result = agent.invoke(messages, seed=self.seed)
+
+ return {mappers.EVALUATED_LLM_TASK_OUTPUT: result}

  return llm_task
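
Taken together, the hunks above change the optimizer's public surface: project_name is dropped from __init__, optimize_prompt now requires a chat_prompt.ChatPrompt and accepts an optional agent_class, and evaluate_prompt becomes the private _evaluate_prompt. The following is a rough 1.0.0 usage sketch inferred only from the signatures visible in this diff; the model id, dataset name, metric, and the ChatPrompt constructor arguments are placeholders and assumptions, not something this diff confirms.

import opik

from opik_optimizer.few_shot_bayesian_optimizer.few_shot_bayesian_optimizer import (
    FewShotBayesianOptimizer,
)
from opik_optimizer.optimization_config import chat_prompt


def exact_match(dataset_item, llm_output):
    # Placeholder metric; optimize_prompt only checks that this is a callable
    # taking dataset_item and llm_output (see the callable() check added above).
    return 1.0 if llm_output.strip() == str(dataset_item.get("answer", "")).strip() else 0.0


# Assumed ChatPrompt construction; this diff only shows get_messages(),
# set_messages(), to_dict(), and the model/model_kwargs attributes.
prompt = chat_prompt.ChatPrompt(
    messages=[
        {"role": "system", "content": "Answer the question."},
        {"role": "user", "content": "{question}"},
    ]
)

optimizer = FewShotBayesianOptimizer(
    model="openai/gpt-4o-mini",  # placeholder model id
    min_examples=2,
    max_examples=8,
    seed=42,
    n_threads=8,
    verbose=1,  # the 0.9.1 project_name argument no longer exists
)

dataset = opik.Opik().get_dataset("my-dataset")  # placeholder dataset name

result = optimizer.optimize_prompt(
    prompt=prompt,
    dataset=dataset,
    metric=exact_match,
    n_trials=10,
    n_samples=50,
    # agent_class=MyAgent,  # optional OptimizableAgent subclass; defaults to a LiteLLM-based agent
)
print(result.score, result.prompt)

Note that the project name previously passed to the optimizer is now carried by the agent class instead (the evaluation calls above read self.agent_class.project_name).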