opik-optimizer 0.9.2__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. opik_optimizer/__init__.py +7 -5
  2. opik_optimizer/_throttle.py +8 -8
  3. opik_optimizer/base_optimizer.py +98 -45
  4. opik_optimizer/cache_config.py +5 -3
  5. opik_optimizer/datasets/ai2_arc.py +15 -13
  6. opik_optimizer/datasets/cnn_dailymail.py +19 -15
  7. opik_optimizer/datasets/election_questions.py +10 -11
  8. opik_optimizer/datasets/gsm8k.py +16 -11
  9. opik_optimizer/datasets/halu_eval.py +6 -5
  10. opik_optimizer/datasets/hotpot_qa.py +17 -16
  11. opik_optimizer/datasets/medhallu.py +10 -7
  12. opik_optimizer/datasets/rag_hallucinations.py +11 -8
  13. opik_optimizer/datasets/ragbench.py +17 -9
  14. opik_optimizer/datasets/tiny_test.py +33 -37
  15. opik_optimizer/datasets/truthful_qa.py +18 -12
  16. opik_optimizer/demo/cache.py +6 -6
  17. opik_optimizer/demo/datasets.py +3 -7
  18. opik_optimizer/evolutionary_optimizer/__init__.py +3 -1
  19. opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +722 -429
  20. opik_optimizer/evolutionary_optimizer/reporting.py +155 -74
  21. opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +271 -188
  22. opik_optimizer/few_shot_bayesian_optimizer/reporting.py +79 -28
  23. opik_optimizer/logging_config.py +19 -15
  24. opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +209 -129
  25. opik_optimizer/meta_prompt_optimizer/reporting.py +121 -46
  26. opik_optimizer/mipro_optimizer/__init__.py +2 -0
  27. opik_optimizer/mipro_optimizer/_lm.py +38 -9
  28. opik_optimizer/mipro_optimizer/_mipro_optimizer_v2.py +37 -26
  29. opik_optimizer/mipro_optimizer/mipro_optimizer.py +132 -63
  30. opik_optimizer/mipro_optimizer/utils.py +5 -2
  31. opik_optimizer/optimizable_agent.py +179 -0
  32. opik_optimizer/optimization_config/chat_prompt.py +143 -73
  33. opik_optimizer/optimization_config/configs.py +4 -3
  34. opik_optimizer/optimization_config/mappers.py +18 -6
  35. opik_optimizer/optimization_result.py +22 -13
  36. opik_optimizer/py.typed +0 -0
  37. opik_optimizer/reporting_utils.py +89 -58
  38. opik_optimizer/task_evaluator.py +12 -14
  39. opik_optimizer/utils.py +117 -14
  40. {opik_optimizer-0.9.2.dist-info → opik_optimizer-1.0.1.dist-info}/METADATA +8 -8
  41. opik_optimizer-1.0.1.dist-info/RECORD +50 -0
  42. opik_optimizer-0.9.2.dist-info/RECORD +0 -48
  43. {opik_optimizer-0.9.2.dist-info → opik_optimizer-1.0.1.dist-info}/WHEEL +0 -0
  44. {opik_optimizer-0.9.2.dist-info → opik_optimizer-1.0.1.dist-info}/licenses/LICENSE +0 -0
  45. {opik_optimizer-0.9.2.dist-info → opik_optimizer-1.0.1.dist-info}/top_level.txt +0 -0
opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py
@@ -1,23 +1,24 @@
- import copy
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Type
+
  import json
  import logging
  import random
  from datetime import datetime
- from typing import Any, Callable, Dict, List, Optional, Tuple
 
  import litellm
- import opik
  import optuna
  import optuna.samplers
+
+ import opik
  from opik import Dataset
  from opik.evaluation.models.litellm import opik_monitor as opik_litellm_monitor
  from pydantic import BaseModel
 
  from opik_optimizer import base_optimizer
- from opik_optimizer.optimization_config import mappers
-
+ from ..utils import create_litellm_agent_class
+ from ..optimization_config import chat_prompt, mappers
+ from ..optimizable_agent import OptimizableAgent
  from .. import _throttle, optimization_result, task_evaluator, utils
- from ..optimization_config import chat_prompt
  from . import reporting
 
  _limiter = _throttle.get_rate_limiter_for_current_opik_installation()
@@ -53,37 +54,38 @@ Return your output as a JSON object with:
  Respond only with the JSON object. Do not include any explanation or extra text.
  """
 
+
  class FewShotPromptTemplate(BaseModel):
  message_list_with_placeholder: List[Dict[str, str]]
  example_template: str
 
+
  class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
  """
  The Few-Shot Bayesian Optimizer can be used to add few-shot examples to prompts. This algorithm
  employes a two stage pipeline:
 
- 1. We generate a few-shot prompt template that is inserted can be inserted into the prompt
+ 1. We generate a few-shot prompt template that is inserted can be inserted into the prompt
  provided
  2. We use Bayesian Optimization to determine the best examples to include in the prompt.
 
  This algorithm is best used when you have a well defined task and would like to guide the LLM
  by providing some examples.
  """
+
  def __init__(
  self,
  model: str,
- project_name: Optional[str] = "Optimization",
  min_examples: int = 2,
  max_examples: int = 8,
  seed: int = 42,
  n_threads: int = 8,
  verbose: int = 1,
- **model_kwargs,
+ **model_kwargs: Any,
  ) -> None:
  """
  Args:
  model: The model to used to evaluate the prompt
- project_name: Optional project name for tracking
  min_examples: Minimum number of examples to include
  max_examples: Maximum number of examples to include
  seed: Random seed for reproducibility
@@ -91,38 +93,55 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
  verbose: Controls internal logging/progress bars (0=off, 1=on).
  **model_kwargs: Additional model parameters
  """
- super().__init__(model, project_name, **model_kwargs)
+ super().__init__(model, verbose, **model_kwargs)
  self.min_examples = min_examples
  self.max_examples = max_examples
  self.seed = seed
  self.n_threads = n_threads
- self.verbose = verbose
- if verbose == 0:
+ if self.verbose == 0:
  logger.setLevel(logging.WARNING)
- elif verbose == 1:
+ elif self.verbose == 1:
  logger.setLevel(logging.INFO)
- elif verbose == 2:
+ elif self.verbose == 2:
  logger.setLevel(logging.DEBUG)
-
+
  self._opik_client = opik.Opik()
  self.llm_call_counter = 0
  logger.debug(f"Initialized FewShotBayesianOptimizer with model: {model}")
 
  @_throttle.rate_limited(_limiter)
- def _call_model(self, model, messages, seed, model_kwargs):
+ def _call_model(
+ self,
+ model: str,
+ messages: List[Dict[str, str]],
+ seed: int,
+ model_kwargs: Dict[str, Any],
+ ) -> Dict[str, Any]:
+ """
+ Args:
+ model: The model to use for the call
+ messages: List of message dictionaries with 'role' and 'content' keys
+ seed: Random seed for reproducibility
+ model_kwargs: Additional model parameters
+
+ Returns:
+ Dict containing the model's response
+ """
  self.llm_call_counter += 1
 
  current_model_kwargs = self.model_kwargs.copy()
  current_model_kwargs.update(model_kwargs)
 
  filtered_call_kwargs = current_model_kwargs.copy()
- filtered_call_kwargs.pop('n_trials', None)
- filtered_call_kwargs.pop('n_samples', None)
- filtered_call_kwargs.pop('n_iterations', None)
- filtered_call_kwargs.pop('min_examples', None)
- filtered_call_kwargs.pop('max_examples', None)
-
- final_params_for_litellm = opik_litellm_monitor.try_add_opik_monitoring_to_params(filtered_call_kwargs)
+ filtered_call_kwargs.pop("n_trials", None)
+ filtered_call_kwargs.pop("n_samples", None)
+ filtered_call_kwargs.pop("n_iterations", None)
+ filtered_call_kwargs.pop("min_examples", None)
+ filtered_call_kwargs.pop("max_examples", None)
+
+ final_params_for_litellm = (
+ opik_litellm_monitor.try_add_opik_monitoring_to_params(filtered_call_kwargs)
+ )
 
  response = litellm.completion(
  model=self.model,
@@ -136,6 +155,16 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
  def _split_dataset(
  self, dataset: List[Dict[str, Any]], train_ratio: float
  ) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
+ """
+ Split the dataset into training and validation sets.
+
+ Args:
+ dataset: List of dataset items
+ train_ratio: Ratio of items to use for training
+
+ Returns:
+ Tuple of (train_set, validation_set)
+ """
  """Split the dataset into training and validation sets.
 
  Args:
@@ -159,48 +188,56 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
  self,
  model: str,
  prompt: chat_prompt.ChatPrompt,
- few_shot_examples: List[Dict[str, Any]]
+ few_shot_examples: List[Dict[str, Any]],
  ) -> FewShotPromptTemplate:
+ """
+ Generate a few-shot prompt template that can be used to insert examples into the prompt.
+
+ Args:
+ model: The model to use for generating the template
+ prompt: The base prompt to modify
+ few_shot_examples: List of example pairs with input and output fields
+
+ Returns:
+ FewShotPromptTemplate containing the modified message list and example template
+ """
  """
  During this step we update the system prompt to include few-shot examples.
  """
  user_message = {
- "message_list": prompt.formatted_messages,
- "examples": few_shot_examples
+ "message_list": prompt.get_messages(),
+ "examples": few_shot_examples,
  }
-
+
  messages: List[Dict[str, str]] = [
  {"role": "system", "content": SYSTEM_PROMPT_TEMPLATE},
  {"role": "user", "content": json.dumps(user_message)},
  ]
-
+
  logger.debug(f"fewshot_prompt_template - Calling LLM with: {messages}")
- response = self._call_model(
- model,
- messages,
- self.seed,
- self.model_kwargs
- )
+ response = self._call_model(model, messages, self.seed, self.model_kwargs)
  logger.debug(f"fewshot_prompt_template - LLM response: {response}")
 
  try:
  res = utils.json_to_dict(response["choices"][0]["message"]["content"])
  return FewShotPromptTemplate(
  message_list_with_placeholder=res["message_list_with_placeholder"],
- example_template=res["example_template"]
+ example_template=res["example_template"],
  )
  except Exception as e:
- logger.error(f"Failed to compute few-shot prompt template: {e} - response: {response}")
+ logger.error(
+ f"Failed to compute few-shot prompt template: {e} - response: {response}"
+ )
  raise
 
  def _run_optimization(
  self,
- initial_prompt: chat_prompt.ChatPrompt,
+ prompt: chat_prompt.ChatPrompt,
  fewshot_prompt_template: FewShotPromptTemplate,
  dataset: Dataset,
  metric: Callable,
+ baseline_score: float,
  n_trials: int = 10,
- baseline_score: Optional[float] = None,
  optimization_id: Optional[str] = None,
  experiment_config: Optional[Dict] = None,
  n_samples: Optional[int] = None,
@@ -208,21 +245,22 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
  reporting.start_optimization_run(verbose=self.verbose)
 
  random.seed(self.seed)
- self.llm_call_counter = 0
-
+
  # Load the dataset
  dataset_items = dataset.get_items()
  all_dataset_item_ids = [item["id"] for item in dataset_items]
  eval_dataset_item_ids = all_dataset_item_ids
  if n_samples is not None and n_samples < len(dataset_items):
  eval_dataset_item_ids = random.sample(all_dataset_item_ids, n_samples)
-
+
  # Define the experiment configuration
  experiment_config = experiment_config or {}
  base_experiment_config = { # Base config for reuse
  **experiment_config,
  **{
  "optimizer": self.__class__.__name__,
+ "agent_class": self.agent_class.__name__,
+ "agent_config": prompt.to_dict(),
  "metric": metric.__name__,
  "dataset": dataset.name,
  "configuration": {},
@@ -251,32 +289,44 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
  for key, value in example.items():
  processed_example[key] = str(value)
 
- processed_demo_example=fewshot_prompt_template.example_template
+ processed_demo_example = fewshot_prompt_template.example_template
  for key, value in processed_example.items():
  try:
- processed_demo_example=processed_demo_example.replace(f"{{{key}}}", str(value))
+ processed_demo_example = processed_demo_example.replace(
+ f"{{{key}}}", str(value)
+ )
  except Exception:
- logger.error(f"Failed to format fewshot prompt template {fewshot_prompt_template} with example: {processed_example} ")
+ logger.error(
+ f"Failed to format fewshot prompt template {fewshot_prompt_template} with example: {processed_example} "
+ )
  raise
  processed_demo_examples.append(processed_demo_example)
  few_shot_examples = "\n\n".join(processed_demo_examples)
-
+
  llm_task = self._build_task_from_messages(
+ prompt=prompt,
  messages=fewshot_prompt_template.message_list_with_placeholder,
- few_shot_examples=few_shot_examples
+ few_shot_examples=few_shot_examples,
  )
 
- messages_for_reporting = [{
- "role": item["role"],
- "content": item["content"].replace(FEW_SHOT_EXAMPLE_PLACEHOLDER, few_shot_examples)
- } for item in fewshot_prompt_template.message_list_with_placeholder]
+ messages_for_reporting = [
+ {
+ "role": item["role"],
+ "content": item["content"].replace(
+ FEW_SHOT_EXAMPLE_PLACEHOLDER, few_shot_examples
+ ),
+ }
+ for item in fewshot_prompt_template.message_list_with_placeholder
+ ]
 
  # Log trial config
  trial_config = base_experiment_config.copy()
- trial_config["configuration"]["prompt"] = messages_for_reporting # Base instruction
- trial_config["configuration"][
- "examples"
- ] = processed_demo_examples # Log stringified examples
+ trial_config["configuration"]["prompt"] = (
+ messages_for_reporting # Base instruction
+ )
+ trial_config["configuration"]["examples"] = (
+ processed_demo_examples # Log stringified examples
+ )
  trial_config["configuration"]["n_examples"] = n_examples
  trial_config["configuration"]["example_indices"] = example_indices
 
@@ -285,7 +335,9 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
  )
  logger.debug(f"Evaluating trial {trial.number}...")
 
- with reporting.start_optimization_trial(trial.number, n_trials, verbose=self.verbose) as trial_reporter:
+ with reporting.start_optimization_trial(
+ trial.number, n_trials, verbose=self.verbose
+ ) as trial_reporter:
  trial_reporter.start_trial(messages_for_reporting)
  score = task_evaluator.evaluate(
  dataset=dataset,
@@ -293,7 +345,7 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
  metric=metric,
  evaluated_task=llm_task,
  num_threads=self.n_threads,
- project_name=self.project_name,
+ project_name=self.agent_class.project_name,
  experiment_config=trial_config,
  optimization_id=optimization_id,
  verbose=self.verbose,
@@ -305,7 +357,7 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
  trial_config = {
  "demo_examples": demo_examples,
  "message_list_with_placeholder": fewshot_prompt_template.message_list_with_placeholder,
- "message_list": messages_for_reporting
+ "message_list": messages_for_reporting,
  }
  trial.set_user_attr("score", score)
  trial.set_user_attr("config", trial_config)
@@ -327,41 +379,55 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
  # Explicitly create and seed the sampler for Optuna
  sampler = optuna.samplers.TPESampler(seed=self.seed)
  study = optuna.create_study(direction="maximize", sampler=sampler)
-
+
  study.optimize(
- optimization_objective,
- n_trials=n_trials,
- show_progress_bar=False
+ optimization_objective, n_trials=n_trials, show_progress_bar=False
  )
-
+
  optuna_history_processed = []
  for trial_idx, trial in enumerate(study.trials):
  if trial.state == optuna.trial.TrialState.COMPLETE:
  trial_config = trial.user_attrs.get("config", {})
- prompt_cand_display = trial_config.get('message_list') # Default to None
-
- score_val = trial.value # This can be None if trial failed to produce a score
+ prompt_cand_display = trial_config.get(
+ "message_list"
+ ) # Default to None
+
+ score_val = (
+ trial.value
+ ) # This can be None if trial failed to produce a score
  duration_val = None
  if trial.datetime_complete and trial.datetime_start:
- duration_val = (trial.datetime_complete - trial.datetime_start).total_seconds()
+ duration_val = (
+ trial.datetime_complete - trial.datetime_start
+ ).total_seconds()
 
  iter_detail = {
- "iteration": trial.number + 1,
- "timestamp": trial.datetime_start.isoformat() if trial.datetime_start else datetime.now().isoformat(),
+ "iteration": trial.number + 1,
+ "timestamp": (
+ trial.datetime_start.isoformat()
+ if trial.datetime_start
+ else datetime.now().isoformat()
+ ),
  "prompt_candidate": prompt_cand_display,
- "parameters_used": {
- "optuna_params": trial.user_attrs.get("config", {}),
- "example_indices": trial.user_attrs.get("example_indices", []) # Default to empty list
+ "parameters_used": {
+ "optuna_params": trial.user_attrs.get("config", {}),
+ "example_indices": trial.user_attrs.get(
+ "example_indices", []
+ ), # Default to empty list
  },
- "scores": [{
- "metric_name": metric.__name__,
- "score": score_val, # Can be None
- }],
+ "scores": [
+ {
+ "metric_name": metric.__name__,
+ "score": score_val, # Can be None
+ }
+ ],
  "duration_seconds": duration_val,
  }
  optuna_history_processed.append(iter_detail)
  else:
- logger.warning(f"Skipping trial {trial.number} from history due to state: {trial.state}. Value: {trial.value}")
+ logger.warning(
+ f"Skipping trial {trial.number} from history due to state: {trial.state}. Value: {trial.value}"
+ )
 
  best_trial = study.best_trial
  best_score = best_trial.value
@@ -369,29 +435,33 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
 
  if best_score <= baseline_score:
  best_score = baseline_score
- best_prompt = initial_prompt.formatted_messages
+ best_prompt = prompt.get_messages()
  else:
  best_prompt = best_trial.user_attrs["config"]["message_list"]
 
  reporting.display_result(
  initial_score=baseline_score,
  best_score=best_score,
- best_prompt=best_trial.user_attrs["config"]["message_list"],
- verbose=self.verbose
+ best_prompt=best_prompt,
+ verbose=self.verbose,
  )
 
  return optimization_result.OptimizationResult(
  optimizer=self.__class__.__name__,
- prompt=best_trial.user_attrs["config"]["message_list"],
- initial_prompt=initial_prompt.formatted_messages,
+ prompt=best_prompt,
+ initial_prompt=prompt.get_messages(),
  initial_score=baseline_score,
  score=best_score,
  metric_name=metric.__name__,
  details={
  "initial_score": baseline_score,
- "chat_messages": best_trial.user_attrs["config"]["message_list"],
+ "chat_messages": (
+ best_trial.user_attrs["config"]["message_list"]
+ if best_trial.user_attrs["config"]
+ else []
+ ),
  "prompt_parameter": best_trial.user_attrs["config"],
- #"n_examples": best_n_examples,
+ # "n_examples": best_n_examples,
  "example_indices": best_example_indices,
  "trial_number": best_trial.number,
  "total_trials": n_trials,
@@ -402,39 +472,53 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
  "temperature": self.model_kwargs.get("temperature"),
  },
  history=optuna_history_processed,
- llm_calls=self.llm_call_counter
+ llm_calls=self.llm_call_counter,
+ dataset_id=dataset.id,
+ optimization_id=optimization_id,
  )
 
- def optimize_prompt( # type: ignore
+ def optimize_prompt( # type: ignore
  self,
  prompt: chat_prompt.ChatPrompt,
  dataset: Dataset,
  metric: Callable,
  n_trials: int = 10,
+ agent_class: Optional[Type[OptimizableAgent]] = None,
  experiment_config: Optional[Dict] = None,
  n_samples: Optional[int] = None,
  ) -> optimization_result.OptimizationResult:
  """
  Args:
- prompt: The prompt to optimize
+ prompt:
  dataset: Opik Dataset to optimize on
  metric: Metric function to evaluate on
  n_trials: Number of trials for Bayesian Optimization
  experiment_config: Optional configuration for the experiment, useful to log additional metadata
  n_samples: Optional number of items to test in the dataset
-
+
  Returns:
  OptimizationResult: Result of the optimization
  """
  if not isinstance(prompt, chat_prompt.ChatPrompt):
  raise ValueError("Prompt must be a ChatPrompt object")
-
+
  if not isinstance(dataset, Dataset):
  raise ValueError("Dataset must be a Dataset object")
-
- if not isinstance(metric, Callable):
- raise ValueError("Metric must be a function that takes `dataset_item` and `llm_output` as arguments.")
 
+ if not callable(metric):
+ raise ValueError(
+ "Metric must be a function that takes `dataset_item` and `llm_output` as arguments."
+ )
+
+ if prompt.model is None:
+ prompt.model = self.model
+ if prompt.model_kwargs is None:
+ prompt.model_kwargs = self.model_kwargs
+
+ if agent_class is None:
+ self.agent_class = create_litellm_agent_class(prompt)
+ else:
+ self.agent_class = agent_class
 
  optimization = None
  try:
@@ -451,87 +535,87 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
  optimization = None
  optimization_run_id = None
 
- try:
- # Start experiment reporting
- reporting.display_header(
- algorithm=self.__class__.__name__,
- optimization_id=optimization_run_id,
- dataset_id=dataset.id,
- verbose=self.verbose
- )
- reporting.display_configuration(
- prompt.formatted_messages,
- optimizer_config={
- "optimizer": self.__class__.__name__,
- "metric": metric.__name__,
- "n_trials": n_trials,
- "n_samples": n_samples
- },
- verbose=self.verbose
- )
-
- utils.disable_experiment_reporting()
-
- # Step 1. Compute the baseline evaluation
- with reporting.display_evaluation(message="First we will establish the baseline performance:", verbose=self.verbose) as eval_report:
- baseline_score = self.evaluate_prompt(
- prompt=prompt,
- dataset=dataset,
- metric=metric,
- n_samples=n_samples,
- optimization_id=optimization.id if optimization is not None else None
- )
-
- eval_report.set_score(baseline_score)
-
- # Step 2. Create the few-shot prompt template
- with reporting.creation_few_shot_prompt_template(verbose=self.verbose) as fewshot_template_report:
- fewshot_template = self._create_fewshot_prompt_template(
- model=self.model,
- prompt=prompt,
- few_shot_examples=[{k: v for k, v in item.items() if k != 'id'}
- for item in dataset.get_items(nb_samples=10)]
- )
+ # Start experiment reporting
+ reporting.display_header(
+ algorithm=self.__class__.__name__,
+ optimization_id=optimization_run_id,
+ dataset_id=dataset.id,
+ verbose=self.verbose,
+ )
+ reporting.display_configuration(
+ prompt.get_messages(),
+ optimizer_config={
+ "optimizer": self.__class__.__name__,
+ "metric": metric.__name__,
+ "n_trials": n_trials,
+ "n_samples": n_samples,
+ },
+ verbose=self.verbose,
+ )
 
- fewshot_template_report.set_fewshot_template(fewshot_template)
+ utils.disable_experiment_reporting()
 
- # Step 3. Start the optimization process
- result = self._run_optimization(
- initial_prompt=prompt,
- fewshot_prompt_template=fewshot_template,
+ # Step 1. Compute the baseline evaluation
+ with reporting.display_evaluation(
+ message="First we will establish the baseline performance:",
+ verbose=self.verbose,
+ ) as eval_report:
+ baseline_score = self._evaluate_prompt(
+ prompt,
  dataset=dataset,
  metric=metric,
- optimization_id=optimization.id if optimization is not None else None,
- experiment_config=experiment_config,
- n_trials=n_trials,
- baseline_score=baseline_score,
  n_samples=n_samples,
+ optimization_id=(optimization.id if optimization is not None else None),
  )
- if optimization:
- self.update_optimization(optimization, status="completed")
 
- utils.enable_experiment_reporting()
- return result
- except Exception as e:
- if optimization:
- self.update_optimization(optimization, status="cancelled")
- logger.error(f"FewShotBayesian optimization failed: {e}", exc_info=True)
- utils.enable_experiment_reporting()
- raise e
+ eval_report.set_score(baseline_score)
 
- def evaluate_prompt(
+ # Step 2. Create the few-shot prompt template
+ with reporting.creation_few_shot_prompt_template(
+ verbose=self.verbose
+ ) as fewshot_template_report:
+ fewshot_template = self._create_fewshot_prompt_template(
+ model=self.model,
+ prompt=prompt,
+ few_shot_examples=[
+ {k: v for k, v in item.items() if k != "id"}
+ for item in dataset.get_items(nb_samples=10)
+ ],
+ )
+
+ fewshot_template_report.set_fewshot_template(fewshot_template)
+
+ # Step 3. Start the optimization process
+ result = self._run_optimization(
+ prompt=prompt,
+ fewshot_prompt_template=fewshot_template,
+ dataset=dataset,
+ metric=metric,
+ baseline_score=baseline_score,
+ optimization_id=optimization.id if optimization is not None else None,
+ experiment_config=experiment_config,
+ n_trials=n_trials,
+ n_samples=n_samples,
+ )
+ if optimization:
+ self.update_optimization(optimization, status="completed")
+
+ utils.enable_experiment_reporting()
+ return result
+
+ def _evaluate_prompt(
  self,
  prompt: chat_prompt.ChatPrompt,
  dataset: opik.Dataset,
  metric: Callable,
+ n_samples: Optional[int] = None,
  dataset_item_ids: Optional[List[str]] = None,
  experiment_config: Optional[Dict] = None,
  optimization_id: Optional[str] = None,
- n_samples: Optional[int] = None,
+ **kwargs: Any,
  ) -> float:
  """
  Args:
- prompt: The prompt to evaluate
  dataset: Opik Dataset to evaluate the prompt on
  metric: Metric function to evaluate on, should have the arguments `dataset_item` and `llm_output`
  dataset_item_ids: Optional list of dataset item IDs to evaluate
@@ -541,27 +625,19 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
  Returns:
  float: The evaluation score
  """
- # Ensure prompt is correctly formatted
- if not all(
- isinstance(item, dict) and "role" in item and "content" in item
- for item in prompt.formatted_messages
- ):
- raise ValueError(
- "A ChatPrompt must be a list of dictionaries with 'role' and 'content' keys."
- )
-
- llm_task = self._build_task_from_messages(prompt.formatted_messages)
+ llm_task = self._build_task_from_messages(prompt, prompt.get_messages())
 
  experiment_config = experiment_config or {}
+ experiment_config["project_name"] = self.agent_class.__name__
  experiment_config = {
  **experiment_config,
  **{
  "optimizer": self.__class__.__name__,
+ "agent_class": self.agent_class.__name__,
+ "agent_config": prompt.to_dict(),
  "metric": metric.__name__,
  "dataset": dataset.name,
- "configuration": {
- "prompt": prompt.formatted_messages,
- },
+ "configuration": {"prompt": prompt.get_messages()},
  },
  }
 
@@ -579,7 +655,7 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
  metric=metric,
  evaluated_task=llm_task,
  num_threads=self.n_threads,
- project_name=self.project_name,
+ project_name=self.agent_class.project_name,
  experiment_config=experiment_config,
  optimization_id=optimization_id,
  verbose=self.verbose,
@@ -588,29 +664,36 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
 
  return score
 
-
  def _build_task_from_messages(
- self, messages: List[Dict[str, str]], few_shot_examples: Optional[str] = None
- ):
+ self,
+ prompt: chat_prompt.ChatPrompt,
+ messages: List[Dict[str, str]],
+ few_shot_examples: Optional[str] = None,
+ ) -> Callable[[Dict[str, Any]], Dict[str, Any]]:
+ new_prompt = prompt.copy()
+ new_prompt.set_messages(messages)
+ agent = self.agent_class(new_prompt)
+
  def llm_task(dataset_item: Dict[str, Any]) -> Dict[str, Any]:
- prompt_ = copy.deepcopy(messages)
- for key, value in dataset_item.items():
- for item in prompt_:
- item["content"] = item["content"].replace("{" + key + "}", str(value))
+ """
+ Process a single dataset item through the LLM task.
+
+ Args:
+ dataset_item: Dictionary containing the dataset item data
+
+ Returns:
+ Dictionary containing the LLM's response
+ """
+ messages = new_prompt.get_messages(dataset_item)
 
  if few_shot_examples:
- for item in prompt_:
- item["content"] = item["content"].replace(FEW_SHOT_EXAMPLE_PLACEHOLDER, few_shot_examples)
-
- response = self._call_model(
- model=self.model,
- messages=prompt_,
- seed=self.seed,
- model_kwargs=self.model_kwargs
- )
+ for message in messages:
+ message["content"] = message["content"].replace(
+ FEW_SHOT_EXAMPLE_PLACEHOLDER, few_shot_examples
+ )
 
- return {
- mappers.EVALUATED_LLM_TASK_OUTPUT: response.choices[0].message.content
- }
+ result = agent.invoke(messages, seed=self.seed)
+
+ return {mappers.EVALUATED_LLM_TASK_OUTPUT: result}
 
  return llm_task
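
To put the 1.0.1 interface in context, here is a minimal usage sketch based only on the signatures visible in this diff. The package-level imports, the tiny_test() loader, the ChatPrompt constructor arguments, and the dataset's "label" field are assumptions for illustration; the optimizer constructor (note that the 0.9.2 project_name argument is gone in 1.0.1), the optimize_prompt signature, and the dataset_item/llm_output metric contract are taken from the code above.

# Sketch only -- assumes these names are exported at the package level.
from opik_optimizer import ChatPrompt, FewShotBayesianOptimizer
from opik_optimizer.datasets import tiny_test  # hypothetical loader for datasets/tiny_test.py

dataset = tiny_test()  # an opik.Dataset

def exact_match(dataset_item, llm_output):
    # Metric contract from the diff: a callable over dataset_item and llm_output.
    # "label" is a hypothetical dataset field used for illustration.
    return 1.0 if llm_output.strip() == str(dataset_item["label"]).strip() else 0.0

optimizer = FewShotBayesianOptimizer(
    model="openai/gpt-4o-mini",  # any LiteLLM model string
    min_examples=2,
    max_examples=8,
    n_threads=8,
    seed=42,
)

prompt = ChatPrompt(  # constructor arguments shown here are assumptions
    system="Answer the question concisely.",
    user="{text}",
)

result = optimizer.optimize_prompt(
    prompt=prompt,
    dataset=dataset,
    metric=exact_match,
    n_trials=10,
    n_samples=50,
)
print(result.score, result.prompt)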