opik-optimizer 0.8.1__py3-none-any.whl → 0.9.0rc0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. opik_optimizer/__init__.py +15 -26
  2. opik_optimizer/base_optimizer.py +28 -44
  3. opik_optimizer/datasets/__init__.py +6 -7
  4. opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +742 -726
  5. opik_optimizer/evolutionary_optimizer/reporting.py +246 -0
  6. opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +296 -194
  7. opik_optimizer/few_shot_bayesian_optimizer/reporting.py +119 -0
  8. opik_optimizer/meta_prompt_optimizer/__init__.py +5 -0
  9. opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +816 -0
  10. opik_optimizer/meta_prompt_optimizer/reporting.py +140 -0
  11. opik_optimizer/mipro_optimizer/__init__.py +1 -1
  12. opik_optimizer/mipro_optimizer/_mipro_optimizer_v2.py +12 -20
  13. opik_optimizer/mipro_optimizer/mipro_optimizer.py +32 -52
  14. opik_optimizer/mipro_optimizer/utils.py +1 -23
  15. opik_optimizer/optimization_config/chat_prompt.py +106 -0
  16. opik_optimizer/optimization_config/configs.py +2 -21
  17. opik_optimizer/optimization_config/mappers.py +1 -1
  18. opik_optimizer/optimization_result.py +57 -85
  19. opik_optimizer/reporting_utils.py +180 -0
  20. opik_optimizer/task_evaluator.py +33 -25
  21. opik_optimizer/utils.py +187 -3
  22. {opik_optimizer-0.8.1.dist-info → opik_optimizer-0.9.0rc0.dist-info}/METADATA +15 -31
  23. opik_optimizer-0.9.0rc0.dist-info/RECORD +48 -0
  24. {opik_optimizer-0.8.1.dist-info → opik_optimizer-0.9.0rc0.dist-info}/WHEEL +1 -1
  25. opik_optimizer/few_shot_bayesian_optimizer/prompt_parameter.py +0 -91
  26. opik_optimizer/few_shot_bayesian_optimizer/prompt_templates.py +0 -80
  27. opik_optimizer/integrations/__init__.py +0 -0
  28. opik_optimizer/meta_prompt_optimizer.py +0 -1151
  29. opik_optimizer-0.8.1.dist-info/RECORD +0 -45
  30. {opik_optimizer-0.8.1.dist-info → opik_optimizer-0.9.0rc0.dist-info}/licenses/LICENSE +0 -0
  31. {opik_optimizer-0.8.1.dist-info → opik_optimizer-0.9.0rc0.dist-info}/top_level.txt +0 -0
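
The bulk of this release is a calling-convention change that runs through every optimizer: the MetricConfig/TaskConfig objects used in 0.8.1 are replaced by a chat_prompt.ChatPrompt plus a plain metric callable that receives `dataset_item` and `llm_output` (see the new evaluate_prompt docstring in the diff below). The following sketch shows what the 0.9.0rc0 style looks like, inferred from the signatures in this diff; the dataset name, metric body, model name, and the exact ChatPrompt constructor arguments are illustrative assumptions, since optimization_config/chat_prompt.py itself is not reproduced here.

import opik
from opik_optimizer import FewShotBayesianOptimizer
from opik_optimizer.optimization_config import chat_prompt

# Assumed: ChatPrompt is built from a list of role/content messages
# (its constructor lives in the new chat_prompt.py, which is not shown in this diff).
prompt = chat_prompt.ChatPrompt(
    messages=[
        {"role": "system", "content": "Answer the question concisely."},
        {"role": "user", "content": "{question}"},
    ]
)

# Metric functions are now plain callables taking `dataset_item` and `llm_output`.
def exact_match(dataset_item, llm_output):
    return float(dataset_item["answer"].strip() == llm_output.strip())

dataset = opik.Opik().get_dataset("my-dataset")  # assumed dataset name

optimizer = FewShotBayesianOptimizer(model="openai/gpt-4o-mini", n_threads=8, seed=42)
result = optimizer.optimize_prompt(
    prompt=prompt,
    dataset=dataset,
    metric=exact_match,
    n_trials=10,
    n_samples=50,
)

The hunks that follow are from opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py (entry 6 in the list above).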
@@ -1,54 +1,107 @@
+ import json
+ import logging
  import random
- from typing import Any, Dict, List, Tuple, Union, Optional, Literal
+ from datetime import datetime
+ from typing import Any, Callable, Dict, List, Optional, Tuple
+
+ import litellm
  import opik
  import optuna
  import optuna.samplers
- import logging
- import json
- from datetime import datetime
-
  from opik import Dataset
- from opik_optimizer.optimization_config import mappers
+ from opik.evaluation.models.litellm import opik_monitor as opik_litellm_monitor
+ from pydantic import BaseModel

- from opik_optimizer.optimization_config.configs import TaskConfig, MetricConfig
  from opik_optimizer import base_optimizer
+ from opik_optimizer.optimization_config import mappers

- from . import prompt_parameter
- from . import prompt_templates
- from .. import _throttle
- from .. import optimization_result, task_evaluator
-
- import litellm
-
- from opik.evaluation.models.litellm import opik_monitor as opik_litellm_monitor
+ from .. import _throttle, optimization_result, task_evaluator, utils
+ from ..optimization_config import chat_prompt
+ from . import reporting

  _limiter = _throttle.get_rate_limiter_for_current_opik_installation()

  logger = logging.getLogger(__name__)

+ FEW_SHOT_EXAMPLE_PLACEHOLDER = "FEW_SHOT_EXAMPLE_PLACEHOLDER"
+ SYSTEM_PROMPT_TEMPLATE = f"""
+ You are a prompt editor that modifies a message list to support few-shot learning. Your job is to insert a placeholder where few-shot examples can be inserted and generate a reusable string template for formatting those examples.
+
+ You will receive a JSON object with the following fields:
+
+ - "message_list": a list of messages, each with a role (system, user, or assistant) and a content field.
+ - "examples": a list of example pairs, each with input and output fields.
+
+ Your task:
+
+ - Insert the string "{FEW_SHOT_EXAMPLE_PLACEHOLDER}" into one of the messages in the list. Make sure to:
+   - Insert it at the most logical point for including few-shot examples — typically as part of the system message
+   - Add a section title in XML or markdown format. The examples will be provided as `example_1\nexample_2\n...` with each example following the example template.
+ - Analyze the examples to infer a consistent structure, and create a single string few_shot_example_template using the Python .format() style. Make sure to follow these instructions:
+   - Unless absolutely relevant, do not return an object but instead a string that can be inserted as part of {FEW_SHOT_EXAMPLE_PLACEHOLDER}
+   - Make sure to include the variables as part of this string so we can perform string formatting with actual examples. Only variables available in the examples can be used. Do not use anything else, and do not apply any transformations to the variables either.
+   - The few-shot examples should include the expected response, as the goal is to provide examples of the expected output format.
+   - Ensure the format of the few-shot examples is consistent with how the model will be called
+
+ Return your output as a JSON object with:
+
+ - message_list_with_placeholder: the updated list with "FEW_SHOT_EXAMPLE_PLACEHOLDER" inserted.
+ - example_template: a string template using the fields provided in the examples (you don't need to use all of them)
+
+ Respond only with the JSON object. Do not include any explanation or extra text.
+ """
+
+ class FewShotPromptTemplate(BaseModel):
+     message_list_with_placeholder: List[Dict[str, str]]
+     example_template: str

  class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
+     """
+     The Few-Shot Bayesian Optimizer can be used to add few-shot examples to prompts. This algorithm
+     employs a two-stage pipeline:
+
+     1. We generate a few-shot prompt template that can be inserted into the provided prompt.
+     2. We use Bayesian Optimization to determine the best examples to include in the prompt.
+
+     This algorithm is best used when you have a well-defined task and would like to guide the LLM
+     by providing some examples.
+     """
      def __init__(
          self,
          model: str,
-         project_name: Optional[str] = None,
+         project_name: Optional[str] = "Optimization",
          min_examples: int = 2,
          max_examples: int = 8,
          seed: int = 42,
          n_threads: int = 8,
-         n_initial_prompts: int = 5,
-         n_iterations: int = 10,
          verbose: int = 1,
          **model_kwargs,
      ) -> None:
+         """
+         Args:
+             model: The model used to evaluate the prompt
+             project_name: Optional project name for tracking
+             min_examples: Minimum number of examples to include
+             max_examples: Maximum number of examples to include
+             seed: Random seed for reproducibility
+             n_threads: Number of threads for parallel evaluation
+             verbose: Controls internal logging/progress bars (0=off, 1=on, 2=debug).
+             **model_kwargs: Additional model parameters
+         """
          super().__init__(model, project_name, **model_kwargs)
          self.min_examples = min_examples
          self.max_examples = max_examples
          self.seed = seed
          self.n_threads = n_threads
-         self.n_initial_prompts = n_initial_prompts
-         self.n_iterations = n_iterations
          self.verbose = verbose
+         if verbose == 0:
+             logger.setLevel(logging.WARNING)
+         elif verbose == 1:
+             logger.setLevel(logging.INFO)
+         elif verbose == 2:
+             logger.setLevel(logging.DEBUG)
+
          self._opik_client = opik.Opik()
          self.llm_call_counter = 0
          logger.debug(f"Initialized FewShotBayesianOptimizer with model: {model}")
@@ -66,7 +119,6 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
          filtered_call_kwargs.pop('n_iterations', None)
          filtered_call_kwargs.pop('min_examples', None)
          filtered_call_kwargs.pop('max_examples', None)
-         filtered_call_kwargs.pop('n_initial_prompts', None)

          final_params_for_litellm = opik_litellm_monitor.try_add_opik_monitoring_to_params(filtered_call_kwargs)

@@ -101,90 +153,80 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
          split_idx = int(len(dataset) * train_ratio)
          return dataset[:split_idx], dataset[split_idx:]

-     def _optimize_prompt(
+     def _create_fewshot_prompt_template(
          self,
-         dataset: Union[str, Dataset],
-         metric_config: MetricConfig,
-         task_config: TaskConfig,
+         model: str,
+         prompt: chat_prompt.ChatPrompt,
+         few_shot_examples: List[Dict[str, Any]]
+     ) -> FewShotPromptTemplate:
+         """
+         During this step we update the system prompt to include few-shot examples.
+         """
+         user_message = {
+             "message_list": prompt.formatted_messages,
+             "examples": few_shot_examples
+         }
+
+         messages: List[Dict[str, str]] = [
+             {"role": "system", "content": SYSTEM_PROMPT_TEMPLATE},
+             {"role": "user", "content": json.dumps(user_message)},
+         ]
+
+         logger.debug(f"fewshot_prompt_template - Calling LLM with: {messages}")
+         response = self._call_model(
+             model,
+             messages,
+             self.seed,
+             self.model_kwargs
+         )
+         logger.debug(f"fewshot_prompt_template - LLM response: {response}")
+
+         try:
+             res = utils.json_to_dict(response["choices"][0]["message"]["content"])
+             return FewShotPromptTemplate(
+                 message_list_with_placeholder=res["message_list_with_placeholder"],
+                 example_template=res["example_template"]
+             )
+         except Exception as e:
+             logger.error(f"Failed to compute few-shot prompt template: {e} - response: {response}")
+             raise
+
+     def _run_optimization(
+         self,
+         fewshot_prompt_template: FewShotPromptTemplate,
+         dataset: Dataset,
+         metric: Callable,
          n_trials: int = 10,
+         baseline_score: Optional[float] = None,
          optimization_id: Optional[str] = None,
          experiment_config: Optional[Dict] = None,
-         n_samples: int = None,
+         n_samples: Optional[int] = None,
      ) -> optimization_result.OptimizationResult:
+         reporting.start_optimization_run(verbose=self.verbose)
+
          random.seed(self.seed)
          self.llm_call_counter = 0
-
-         if not task_config.use_chat_prompt:
-             raise ValueError(
-                 "Few-shot Bayesian optimization is only supported for chat prompts."
-             )
-
-         opik_dataset: opik.Dataset = dataset
-
+
          # Load the dataset
-         if isinstance(dataset, str):
-             opik_dataset = self._opik_client.get_dataset(dataset)
-             dataset_items = opik_dataset.get_items()
-         else:
-             opik_dataset = dataset
-             dataset_items = opik_dataset.get_items()
-
+         dataset_items = dataset.get_items()
+         all_dataset_item_ids = [item["id"] for item in dataset_items]
+         eval_dataset_item_ids = all_dataset_item_ids
+         if n_samples is not None and n_samples < len(dataset_items):
+             eval_dataset_item_ids = random.sample(all_dataset_item_ids, n_samples)
+
+         # Define the experiment configuration
          experiment_config = experiment_config or {}
          base_experiment_config = { # Base config for reuse
              **experiment_config,
              **{
                  "optimizer": self.__class__.__name__,
-                 "metric": metric_config.metric.name,
-                 "dataset": opik_dataset.name,
+                 "metric": metric.__name__,
+                 "dataset": dataset.name,
                  "configuration": {},
              },
          }

-         # Evaluate Initial (Zero-Shot) Prompt
-         logger.info("Evaluating initial (zero-shot) prompt...")
-         initial_instruction = task_config.instruction_prompt
-         zero_shot_param = prompt_parameter.ChatPromptParameter(
-             name="zero_shot_prompt",
-             instruction=initial_instruction,
-             task_input_parameters=task_config.input_dataset_fields,
-             task_output_parameter=task_config.output_dataset_field,
-             demo_examples=[], # No examples
-         )
-         zero_shot_llm_task = self._build_task_from_prompt_template(
-             zero_shot_param.as_template()
-         )
-
-         initial_eval_config = base_experiment_config.copy()
-         initial_eval_config["configuration"]["prompt"] = initial_instruction
-         initial_eval_config["configuration"]["n_examples"] = 0
-
-         # Determine dataset item IDs for evaluation (initial and trials)
-         all_dataset_item_ids = [item["id"] for item in dataset_items]
-         eval_dataset_item_ids = all_dataset_item_ids
-         if n_samples is not None and n_samples < len(all_dataset_item_ids):
-             eval_dataset_item_ids = random.sample(all_dataset_item_ids, n_samples)
-             logger.info(f"Using {n_samples} samples for evaluations.")
-         else:
-             logger.info(
-                 f"Using all {len(all_dataset_item_ids)} samples for evaluations."
-             )
-
-         initial_score = task_evaluator.evaluate(
-             dataset=opik_dataset,
-             dataset_item_ids=eval_dataset_item_ids,
-             metric_config=metric_config,
-             evaluated_task=zero_shot_llm_task,
-             num_threads=self.n_threads,
-             project_name=self.project_name,
-             experiment_config=initial_eval_config,
-             optimization_id=optimization_id,
-             verbose=self.verbose,
-         )
-         logger.info(f"Initial (zero-shot) score: {initial_score:.4f}")
-
          # Start Optuna Study
-         logger.info("Starting Optuna study for Few-Shot Bayesian Optimization...")
-
          def optimization_objective(trial: optuna.Trial) -> float:
              n_examples = trial.suggest_int(
                  "n_examples", self.min_examples, self.max_examples
@@ -197,7 +239,7 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
              ]
              trial.set_user_attr("example_indices", example_indices)

-             instruction = task_config.instruction_prompt
+             # Process few shot examples
              demo_examples = [dataset_items[idx] for idx in example_indices]

              processed_demo_examples = []
@@ -205,21 +247,29 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
                  processed_example = {}
                  for key, value in example.items():
                      processed_example[key] = str(value)
-                 processed_demo_examples.append(processed_example)
-
-             param = prompt_parameter.ChatPromptParameter(
-                 name=f"trial_{trial.number}_prompt",
-                 instruction=instruction,
-                 task_input_parameters=task_config.input_dataset_fields,
-                 task_output_parameter=task_config.output_dataset_field,
-                 demo_examples=processed_demo_examples,
+
+                 try:
+                     processed_demo_examples.append(
+                         fewshot_prompt_template.example_template.format(**processed_example)
+                     )
+                 except Exception as e:
+                     logger.error(f"Failed to format fewshot prompt template {fewshot_prompt_template} with example: {processed_example} ")
+                     raise
+             few_shot_examples = "\n\n".join(processed_demo_examples)
+
+             llm_task = self._build_task_from_messages(
+                 messages=fewshot_prompt_template.message_list_with_placeholder,
+                 few_shot_examples=few_shot_examples
              )

-             llm_task = self._build_task_from_prompt_template(param.as_template())
+             messages_for_reporting = [{
+                 "role": item["role"],
+                 "content": item["content"].replace(FEW_SHOT_EXAMPLE_PLACEHOLDER, few_shot_examples)
+             } for item in fewshot_prompt_template.message_list_with_placeholder]

              # Log trial config
              trial_config = base_experiment_config.copy()
-             trial_config["configuration"]["prompt"] = instruction # Base instruction
+             trial_config["configuration"]["prompt"] = messages_for_reporting # Base instruction
              trial_config["configuration"][
                  "examples"
              ] = processed_demo_examples # Log stringified examples
@@ -231,21 +281,30 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
              )
              logger.debug(f"Evaluating trial {trial.number}...")

-             score = task_evaluator.evaluate(
-                 dataset=opik_dataset,
-                 dataset_item_ids=eval_dataset_item_ids,
-                 metric_config=metric_config,
-                 evaluated_task=llm_task,
-                 num_threads=self.n_threads,
-                 project_name=self.project_name,
-                 experiment_config=trial_config,
-                 optimization_id=optimization_id,
-                 verbose=self.verbose,
-             )
+             with reporting.start_optimization_trial(trial.number, n_trials, verbose=self.verbose) as trial_reporter:
+                 trial_reporter.start_trial(messages_for_reporting)
+                 score = task_evaluator.evaluate(
+                     dataset=dataset,
+                     dataset_item_ids=eval_dataset_item_ids,
+                     metric=metric,
+                     evaluated_task=llm_task,
+                     num_threads=self.n_threads,
+                     project_name=self.project_name,
+                     experiment_config=trial_config,
+                     optimization_id=optimization_id,
+                     verbose=self.verbose,
+                 )
+                 trial_reporter.set_score(baseline_score, score)
              logger.debug(f"Trial {trial.number} score: {score:.4f}")

+             # Trial results
+             trial_config = {
+                 "demo_examples": demo_examples,
+                 "message_list_with_placeholder": fewshot_prompt_template.message_list_with_placeholder,
+                 "message_list": messages
+             }
              trial.set_user_attr("score", score)
-             trial.set_user_attr("param", param)
+             trial.set_user_attr("config", trial_config)
              return score

          # Configure Optuna Logging
@@ -265,29 +324,18 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
          sampler = optuna.samplers.TPESampler(seed=self.seed)
          study = optuna.create_study(direction="maximize", sampler=sampler)

-         study.optimize(optimization_objective, n_trials=n_trials, show_progress_bar=(self.verbose >= 1))
-         logger.info("Optuna study finished.")
-
+         study.optimize(
+             optimization_objective,
+             n_trials=n_trials,
+             show_progress_bar=False
+         )
+
          optuna_history_processed = []
          for trial_idx, trial in enumerate(study.trials):
              if trial.state == optuna.trial.TrialState.COMPLETE:
-                 param_obj: Optional[prompt_parameter.ChatPromptParameter] = trial.user_attrs.get("param")
-                 prompt_cand_display = None # Default to None
-                 if param_obj and hasattr(param_obj, 'as_template') and callable(param_obj.as_template):
-                     try:
-                         # .format() on ChatPromptTemplate returns the list of messages
-                         chat_messages_for_history = param_obj.as_template().format()
-                         prompt_cand_display = json.dumps(chat_messages_for_history)
-                     except Exception as e_param_format:
-                         logger.warning(f"Trial {trial.number}: Error formatting prompt from param_obj: {e_param_format}")
-                         prompt_cand_display = "Error: Could not format prompt content."
-                 elif not param_obj:
-                     logger.warning(f"Trial {trial.number}: 'param' object not found in user_attrs.")
-                     prompt_cand_display = "Error: Prompt data missing in trial."
-                 else:
-                     logger.warning(f"Trial {trial.number}: 'param' object is not of expected type or lacks methods.")
-                     prompt_cand_display = "Error: Invalid prompt data structure in trial."
-
+                 trial_config = trial.user_attrs.get("config", {})
+                 prompt_cand_display = trial_config.get('message_list') # Default to None
+
                  score_val = trial.value # This can be None if trial failed to produce a score
                  duration_val = None
                  if trial.datetime_complete and trial.datetime_start:
@@ -298,16 +346,13 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
                      "timestamp": trial.datetime_start.isoformat() if trial.datetime_start else datetime.now().isoformat(),
                      "prompt_candidate": prompt_cand_display,
                      "parameters_used": {
-                         "optuna_params": trial.params,
+                         "optuna_params": trial.user_attrs.get("config", {}),
                          "example_indices": trial.user_attrs.get("example_indices", []) # Default to empty list
                      },
                      "scores": [{
-                         "metric_name": metric_config.metric.name,
+                         "metric_name": metric.__name__,
                          "score": score_val, # Can be None
-                         "opik_evaluation_id": None # TODO
                      }],
-                     "tokens_used": None, # TODO
-                     "cost": None, # TODO
                      "duration_seconds": duration_val,
                  }
                  optuna_history_processed.append(iter_detail)
@@ -316,33 +361,30 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):

          best_trial = study.best_trial
          best_score = best_trial.value
-         best_n_examples = best_trial.params["n_examples"]
          best_example_indices = best_trial.user_attrs.get("example_indices", [])
-         best_param: prompt_parameter.ChatPromptParameter = best_trial.user_attrs[
-             "param"
-         ]

-         chat_messages_list = best_param.as_template().format()
-         main_prompt_string = best_param.instruction
+         reporting.display_result(
+             initial_score=baseline_score,
+             best_score=best_score,
+             best_prompt=best_trial.user_attrs["config"]["message_list"],
+             verbose=self.verbose
+         )

          return optimization_result.OptimizationResult(
              optimizer=self.__class__.__name__,
-             prompt=main_prompt_string,
+             prompt=best_trial.user_attrs["config"]["message_list"],
              score=best_score,
-             metric_name=metric_config.metric.name,
+             metric_name=metric.__name__,
              details={
-                 "prompt_type": "chat" if task_config.use_chat_prompt else "non-chat",
-                 "chat_messages": chat_messages_list,
-                 "prompt_parameter": best_param,
-                 "n_examples": best_n_examples,
+                 "chat_messages": best_trial.user_attrs["config"]["message_list"],
+                 "prompt_parameter": best_trial.user_attrs["config"],
+                 #"n_examples": best_n_examples,
                  "example_indices": best_example_indices,
                  "trial_number": best_trial.number,
-                 "initial_score": initial_score,
                  "total_trials": n_trials,
                  "rounds": [],
                  "stopped_early": False,
-                 "metric_config": metric_config.model_dump(),
-                 "task_config": task_config.model_dump(),
+                 "metric_name": metric.__name__,
                  "model": self.model,
                  "temperature": self.model_kwargs.get("temperature"),
              },
@@ -350,20 +392,32 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
              llm_calls=self.llm_call_counter
          )

-     def optimize_prompt(
+     def optimize_prompt( # type: ignore
          self,
-         dataset: Union[str, Dataset],
-         metric_config: MetricConfig,
-         task_config: TaskConfig,
+         prompt: chat_prompt.ChatPrompt,
+         dataset: Dataset,
+         metric: Callable,
          n_trials: int = 10,
          experiment_config: Optional[Dict] = None,
-         n_samples: int = None,
+         n_samples: Optional[int] = None,
      ) -> optimization_result.OptimizationResult:
+         """
+         Args:
+             prompt: The prompt to optimize
+             dataset: Opik Dataset to optimize on
+             metric: Metric function to evaluate on
+             n_trials: Number of trials for Bayesian Optimization
+             experiment_config: Optional configuration for the experiment, useful to log additional metadata
+             n_samples: Optional number of items to test in the dataset
+
+         Returns:
+             OptimizationResult: Result of the optimization
+         """
          optimization = None
          try:
              optimization = self._opik_client.create_optimization(
                  dataset_name=dataset.name,
-                 objective_name=metric_config.metric.name,
+                 objective_name=metric.__name__,
                  metadata={"optimizer": self.__class__.__name__},
              )
          except Exception:
@@ -373,72 +427,109 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
              optimization = None

          try:
-             result = self._optimize_prompt(
-                 optimization_id=optimization.id if optimization is not None else None,
+             # Start experiment reporting
+             reporting.display_header("Few-Shot Bayesian Optimizer", verbose=self.verbose)
+             reporting.display_configuration(
+                 prompt.formatted_messages,
+                 optimizer_config={
+                     "optimizer": self.__class__.__name__,
+                     "metric": metric.__name__,
+                     "n_trials": n_trials,
+                     "n_samples": n_samples
+                 },
+                 verbose=self.verbose
+             )
+
+             utils.disable_experiment_reporting()
+
+             # Step 1. Compute the baseline evaluation
+             with reporting.display_evaluation(message="First we will establish the baseline performance:", verbose=self.verbose) as eval_report:
+                 baseline_score = self.evaluate_prompt(
+                     prompt=prompt,
+                     dataset=dataset,
+                     metric=metric,
+                     n_samples=n_samples,
+                     optimization_id=optimization.id if optimization is not None else None
+                 )
+
+                 eval_report.set_score(baseline_score)
+
+             # Step 2. Create the few-shot prompt template
+             with reporting.creation_few_shot_prompt_template(verbose=self.verbose) as fewshot_template_report:
+                 fewshot_template = self._create_fewshot_prompt_template(
+                     model=self.model,
+                     prompt=prompt,
+                     few_shot_examples=[{k: v for k, v in item.items() if k != 'id'}
+                                        for item in dataset.get_items(nb_samples=10)]
+                 )
+
+                 fewshot_template_report.set_fewshot_template(fewshot_template)
+
+             # Step 3. Start the optimization process
+             result = self._run_optimization(
+                 fewshot_prompt_template=fewshot_template,
                  dataset=dataset,
-                 metric_config=metric_config,
-                 task_config=task_config,
-                 n_trials=n_trials,
+                 metric=metric,
+                 optimization_id=optimization.id if optimization is not None else None,
                  experiment_config=experiment_config,
+                 n_trials=n_trials,
+                 baseline_score=baseline_score,
                  n_samples=n_samples,
              )
              if optimization:
                  self.update_optimization(optimization, status="completed")
+
+             utils.enable_experiment_reporting()
              return result
          except Exception as e:
              if optimization:
                  self.update_optimization(optimization, status="cancelled")
              logger.error(f"FewShotBayesian optimization failed: {e}", exc_info=True)
+             utils.enable_experiment_reporting()
              raise e

      def evaluate_prompt(
          self,
-         prompt: List[Dict[Literal["role", "content"], str]],
+         prompt: chat_prompt.ChatPrompt,
          dataset: opik.Dataset,
-         metric_config: MetricConfig,
-         task_config: Optional[TaskConfig] = None,
+         metric: Callable,
          dataset_item_ids: Optional[List[str]] = None,
          experiment_config: Optional[Dict] = None,
-         n_samples: int = None,
+         optimization_id: Optional[str] = None,
+         n_samples: Optional[int] = None,
      ) -> float:
-
-         if isinstance(prompt, str):
-             if task_config is None:
-                 raise ValueError(
-                     "To use a string prompt, please pass in task_config to evaluate_prompt()"
-                 )
-
-             questions = {
-                 field: ("{{%s}}" % field) for field in task_config.input_dataset_fields
-             }
-             prompt = [
-                 {"role": "system", "content": prompt},
-                 {"role": "user", "content": json.dumps(questions)},
-             ]
-
+         """
+         Args:
+             prompt: The prompt to evaluate
+             dataset: Opik Dataset to evaluate the prompt on
+             metric: Metric function to evaluate on, should have the arguments `dataset_item` and `llm_output`
+             dataset_item_ids: Optional list of dataset item IDs to evaluate
+             experiment_config: Optional configuration for the experiment
+             optimization_id: Optional ID of the optimization
+             n_samples: Optional number of items to test in the dataset
+         Returns:
+             float: The evaluation score
+         """
          # Ensure prompt is correctly formatted
          if not all(
              isinstance(item, dict) and "role" in item and "content" in item
-             for item in prompt
+             for item in prompt.formatted_messages
          ):
              raise ValueError(
                  "A ChatPrompt must be a list of dictionaries with 'role' and 'content' keys."
              )

-         template = prompt_templates.ChatPromptTemplate(
-             prompt, validate_placeholders=False
-         )
-         llm_task = self._build_task_from_prompt_template(template)
+         llm_task = self._build_task_from_messages(prompt.formatted_messages)

          experiment_config = experiment_config or {}
          experiment_config = {
              **experiment_config,
              **{
                  "optimizer": self.__class__.__name__,
-                 "metric": metric_config.metric.name,
+                 "metric": metric.__name__,
                  "dataset": dataset.name,
                  "configuration": {
-                     "examples": prompt,
+                     "prompt": prompt.formatted_messages,
                  },
              },
          }
@@ -450,27 +541,38 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
              all_ids = [dataset_item["id"] for dataset_item in dataset.get_items()]
              dataset_item_ids = random.sample(all_ids, n_samples)

-         logger.debug(f"Starting FewShotBayesian evaluation...")
+         logger.debug("Starting FewShotBayesian evaluation...")
          score = task_evaluator.evaluate(
              dataset=dataset,
              dataset_item_ids=dataset_item_ids,
-             metric_config=metric_config,
+             metric=metric,
              evaluated_task=llm_task,
              num_threads=self.n_threads,
              project_name=self.project_name,
              experiment_config=experiment_config,
+             optimization_id=optimization_id,
              verbose=self.verbose,
          )
          logger.debug(f"Evaluation score: {score:.4f}")

          return score

-     def _build_task_from_prompt_template(
-         self, template: prompt_templates.ChatPromptTemplate
+
+     def _build_task_from_messages(
+         self, messages: List[Dict[str, str]], few_shot_examples: Optional[str] = None
      ):
          def llm_task(dataset_item: Dict[str, Any]) -> Dict[str, Any]:
-             prompt_ = template.format(**dataset_item)
-
+             prompt_ = [{
+                 "role": item["role"],
+                 "content": item["content"].format(**dataset_item)
+             } for item in messages]
+
+             if few_shot_examples:
+                 prompt_ = [{
+                     "role": item["role"],
+                     "content": item["content"].replace(FEW_SHOT_EXAMPLE_PLACEHOLDER, few_shot_examples)
+                 } for item in prompt_]
+
              response = self._call_model(
                  model=self.model,
                  messages=prompt_,
@@ -482,4 +584,4 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
                  mappers.EVALUATED_LLM_TASK_OUTPUT: response.choices[0].message.content
              }

-         return llm_task
+         return llm_task, messages
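
For readers tracing the new few-shot mechanics, the flow above comes down to two string operations: each selected dataset item is rendered with example_template.format(...), the rendered examples are joined with blank lines, and the result replaces FEW_SHOT_EXAMPLE_PLACEHOLDER in the message list. A standalone sketch with made-up template and example data (only the substitution pattern mirrors the code above):

# Hypothetical template and examples; the substitution pattern follows
# optimization_objective and _build_task_from_messages in the diff above.
FEW_SHOT_EXAMPLE_PLACEHOLDER = "FEW_SHOT_EXAMPLE_PLACEHOLDER"

message_list_with_placeholder = [
    {"role": "system", "content": "Answer concisely.\n\n<examples>\n" + FEW_SHOT_EXAMPLE_PLACEHOLDER + "\n</examples>"},
    {"role": "user", "content": "{question}"},
]
example_template = "Q: {question}\nA: {answer}"

demo_examples = [
    {"question": "What is 2 + 2?", "answer": "4"},
    {"question": "What is the capital of France?", "answer": "Paris"},
]

# Render each example with the template (values are stringified first, as in the optimizer).
rendered = [
    example_template.format(**{k: str(v) for k, v in ex.items()}) for ex in demo_examples
]
few_shot_examples = "\n\n".join(rendered)

# Splice the rendered block into the message list in place of the placeholder.
messages = [
    {"role": m["role"], "content": m["content"].replace(FEW_SHOT_EXAMPLE_PLACEHOLDER, few_shot_examples)}
    for m in message_list_with_placeholder
]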