opik-optimizer 0.8.1__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. opik_optimizer/__init__.py +15 -26
  2. opik_optimizer/base_optimizer.py +28 -44
  3. opik_optimizer/datasets/__init__.py +6 -7
  4. opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +742 -726
  5. opik_optimizer/evolutionary_optimizer/reporting.py +246 -0
  6. opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +297 -193
  7. opik_optimizer/few_shot_bayesian_optimizer/reporting.py +119 -0
  8. opik_optimizer/meta_prompt_optimizer/__init__.py +5 -0
  9. opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +816 -0
  10. opik_optimizer/meta_prompt_optimizer/reporting.py +140 -0
  11. opik_optimizer/mipro_optimizer/__init__.py +1 -1
  12. opik_optimizer/mipro_optimizer/_mipro_optimizer_v2.py +12 -20
  13. opik_optimizer/mipro_optimizer/mipro_optimizer.py +32 -52
  14. opik_optimizer/mipro_optimizer/utils.py +1 -23
  15. opik_optimizer/optimization_config/chat_prompt.py +106 -0
  16. opik_optimizer/optimization_config/configs.py +2 -21
  17. opik_optimizer/optimization_config/mappers.py +1 -1
  18. opik_optimizer/optimization_result.py +57 -85
  19. opik_optimizer/reporting_utils.py +180 -0
  20. opik_optimizer/task_evaluator.py +41 -26
  21. opik_optimizer/utils.py +187 -3
  22. {opik_optimizer-0.8.1.dist-info → opik_optimizer-0.9.0.dist-info}/METADATA +15 -31
  23. opik_optimizer-0.9.0.dist-info/RECORD +48 -0
  24. {opik_optimizer-0.8.1.dist-info → opik_optimizer-0.9.0.dist-info}/WHEEL +1 -1
  25. opik_optimizer/few_shot_bayesian_optimizer/prompt_parameter.py +0 -91
  26. opik_optimizer/few_shot_bayesian_optimizer/prompt_templates.py +0 -80
  27. opik_optimizer/integrations/__init__.py +0 -0
  28. opik_optimizer/meta_prompt_optimizer.py +0 -1151
  29. opik_optimizer-0.8.1.dist-info/RECORD +0 -45
  30. {opik_optimizer-0.8.1.dist-info → opik_optimizer-0.9.0.dist-info}/licenses/LICENSE +0 -0
  31. {opik_optimizer-0.8.1.dist-info → opik_optimizer-0.9.0.dist-info}/top_level.txt +0 -0
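The hunks below are from opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py and show the headline API change in 0.9.0: optimize_prompt now takes a ChatPrompt plus a plain metric callable instead of the removed MetricConfig/TaskConfig pair. A minimal sketch of the new call style, assuming the top-level re-export of FewShotBayesianOptimizer and a message-list ChatPrompt constructor (dataset name, model name, field names and metric body are illustrative, not taken from this diff):

```python
import opik
from opik_optimizer import FewShotBayesianOptimizer  # assumption: re-exported by the 0.9.0 __init__
from opik_optimizer.optimization_config import chat_prompt

def exact_match(dataset_item, llm_output):
    # Metric callables take `dataset_item` and `llm_output` (per the evaluate_prompt docstring below).
    return float(llm_output.strip() == str(dataset_item["answer"]).strip())

# Assumption: ChatPrompt accepts a message list; see the new chat_prompt.py (+106 lines) for the real constructor.
prompt = chat_prompt.ChatPrompt(
    messages=[
        {"role": "system", "content": "Answer the question concisely."},
        {"role": "user", "content": "{question}"},  # {field} placeholders are filled from dataset items
    ]
)

dataset = opik.Opik().get_or_create_dataset("my-qa-dataset")  # illustrative dataset

optimizer = FewShotBayesianOptimizer(model="openai/gpt-4o-mini", n_threads=4)
result = optimizer.optimize_prompt(
    prompt=prompt,
    dataset=dataset,
    metric=exact_match,
    n_trials=10,
    n_samples=50,
)
print(result.score, result.prompt)  # OptimizationResult now carries the best message list as `prompt`
```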
@@ -1,54 +1,108 @@
1
+ import json
2
+ import logging
1
3
  import random
2
- from typing import Any, Dict, List, Tuple, Union, Optional, Literal
4
+ from datetime import datetime
5
+ from typing import Any, Callable, Dict, List, Optional, Tuple
6
+
7
+ import litellm
3
8
  import opik
4
9
  import optuna
5
10
  import optuna.samplers
6
- import logging
7
- import json
8
- from datetime import datetime
9
-
10
11
  from opik import Dataset
11
- from opik_optimizer.optimization_config import mappers
12
+ from opik.evaluation.models.litellm import opik_monitor as opik_litellm_monitor
13
+ from pydantic import BaseModel
12
14
 
13
- from opik_optimizer.optimization_config.configs import TaskConfig, MetricConfig
14
15
  from opik_optimizer import base_optimizer
16
+ from opik_optimizer.optimization_config import mappers
15
17
 
16
- from . import prompt_parameter
17
- from . import prompt_templates
18
- from .. import _throttle
19
- from .. import optimization_result, task_evaluator
20
-
21
- import litellm
22
-
23
- from opik.evaluation.models.litellm import opik_monitor as opik_litellm_monitor
18
+ from .. import _throttle, optimization_result, task_evaluator, utils
19
+ from ..optimization_config import chat_prompt
20
+ from . import reporting
24
21
 
25
22
  _limiter = _throttle.get_rate_limiter_for_current_opik_installation()
26
23
 
27
24
  logger = logging.getLogger(__name__)
28
25
 
26
+ FEW_SHOT_EXAMPLE_PLACEHOLDER = "FEW_SHOT_EXAMPLE_PLACEHOLDER"
27
+ SYSTEM_PROMPT_TEMPLATE = f"""
28
+ You are a prompt editor that modifies a message list to support few-shot learning. Your job is to insert a placeholder where few-shot examples can be inserted and generate a reusable string template for formatting those examples.
29
+
30
+ You will receive a JSON object with the following fields:
31
+
32
+ - "message_list": a list of messages, each with a role (system, user, or assistant) and a content field.
33
+ - "examples": a list of example pairs, each with input and output fields.
34
+
35
+ Your task:
36
+
37
+ - Insert the string "{FEW_SHOT_EXAMPLE_PLACEHOLDER}" into one of the messages in the list. Make sure to:
38
+ - Insert it at the most logical point for including few-shot examples — typically as part of the system message
39
+ - Add a section title in XML or markdown format. The examples will be provided as `example_1\nexample_2\n...` with each example following the example template.
40
+ - Analyze the examples to infer a consistent structure, and create a single string few_shot_example_template using the Python .format() style. Make sure to follow these instructions:
41
+ - Unless absolutely relevant, do not return an object but instead a string that can be inserted as part of {FEW_SHOT_EXAMPLE_PLACEHOLDER}
42
+ - Make sure to include the variables as part of this string so we can perform string formatting with actual examples. Only variables available in the examples can be used.
43
+ - Do not apply any transformations to the variables either, only the variable name should be included in the format `{{<variable_name>}}`
44
+ - The few shot examples should include the expected response as the goal is to provide examples of the response.
45
+ - Ensure the format of the few-shot examples is consistent with how the model will be called
46
+
47
+ Return your output as a JSON object with:
48
+
49
+ - message_list_with_placeholder: the updated list with "FEW_SHOT_EXAMPLE_PLACEHOLDER" inserted.
50
+ - example_template: a string template using the fields provided in the examples (you don't need to use all of them)
51
+
52
+ Respond only with the JSON object. Do not include any explanation or extra text.
53
+ """
54
+
55
+ class FewShotPromptTemplate(BaseModel):
56
+ message_list_with_placeholder: List[Dict[str, str]]
57
+ example_template: str
29
58
 
30
59
  class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
60
+ """
61
+ The Few-Shot Bayesian Optimizer can be used to add few-shot examples to prompts. This algorithm
62
+ employs a two-stage pipeline:
63
+
64
+ 1. We generate a few-shot prompt template that can be inserted into the prompt
65
+ provided
66
+ 2. We use Bayesian Optimization to determine the best examples to include in the prompt.
67
+
68
+ This algorithm is best used when you have a well-defined task and would like to guide the LLM
69
+ by providing some examples.
70
+ """
31
71
  def __init__(
32
72
  self,
33
73
  model: str,
34
- project_name: Optional[str] = None,
74
+ project_name: Optional[str] = "Optimization",
35
75
  min_examples: int = 2,
36
76
  max_examples: int = 8,
37
77
  seed: int = 42,
38
78
  n_threads: int = 8,
39
- n_initial_prompts: int = 5,
40
- n_iterations: int = 10,
41
79
  verbose: int = 1,
42
80
  **model_kwargs,
43
81
  ) -> None:
82
+ """
83
+ Args:
84
+ model: The model used to evaluate the prompt
85
+ project_name: Optional project name for tracking
86
+ min_examples: Minimum number of examples to include
87
+ max_examples: Maximum number of examples to include
88
+ seed: Random seed for reproducibility
89
+ n_threads: Number of threads for parallel evaluation
90
+ verbose: Controls internal logging/progress bars (0=warnings only, 1=info, 2=debug).
91
+ **model_kwargs: Additional model parameters
92
+ """
44
93
  super().__init__(model, project_name, **model_kwargs)
45
94
  self.min_examples = min_examples
46
95
  self.max_examples = max_examples
47
96
  self.seed = seed
48
97
  self.n_threads = n_threads
49
- self.n_initial_prompts = n_initial_prompts
50
- self.n_iterations = n_iterations
51
98
  self.verbose = verbose
99
+ if verbose == 0:
100
+ logger.setLevel(logging.WARNING)
101
+ elif verbose == 1:
102
+ logger.setLevel(logging.INFO)
103
+ elif verbose == 2:
104
+ logger.setLevel(logging.DEBUG)
105
+
52
106
  self._opik_client = opik.Opik()
53
107
  self.llm_call_counter = 0
54
108
  logger.debug(f"Initialized FewShotBayesianOptimizer with model: {model}")
@@ -66,7 +120,6 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
66
120
  filtered_call_kwargs.pop('n_iterations', None)
67
121
  filtered_call_kwargs.pop('min_examples', None)
68
122
  filtered_call_kwargs.pop('max_examples', None)
69
- filtered_call_kwargs.pop('n_initial_prompts', None)
70
123
 
71
124
  final_params_for_litellm = opik_litellm_monitor.try_add_opik_monitoring_to_params(filtered_call_kwargs)
72
125
 
@@ -101,90 +154,80 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
101
154
  split_idx = int(len(dataset) * train_ratio)
102
155
  return dataset[:split_idx], dataset[split_idx:]
103
156
 
104
- def _optimize_prompt(
157
+ def _create_fewshot_prompt_template(
105
158
  self,
106
- dataset: Union[str, Dataset],
107
- metric_config: MetricConfig,
108
- task_config: TaskConfig,
159
+ model: str,
160
+ prompt: chat_prompt.ChatPrompt,
161
+ few_shot_examples: List[Dict[str, Any]]
162
+ ) -> FewShotPromptTemplate:
163
+ """
164
+ During this step we update the system prompt to include few-shot examples.
165
+ """
166
+ user_message = {
167
+ "message_list": prompt.formatted_messages,
168
+ "examples": few_shot_examples
169
+ }
170
+
171
+ messages: List[Dict[str, str]] = [
172
+ {"role": "system", "content": SYSTEM_PROMPT_TEMPLATE},
173
+ {"role": "user", "content": json.dumps(user_message)},
174
+ ]
175
+
176
+ logger.debug(f"fewshot_prompt_template - Calling LLM with: {messages}")
177
+ response = self._call_model(
178
+ model,
179
+ messages,
180
+ self.seed,
181
+ self.model_kwargs
182
+ )
183
+ logger.debug(f"fewshot_prompt_template - LLM response: {response}")
184
+
185
+ try:
186
+ res = utils.json_to_dict(response["choices"][0]["message"]["content"])
187
+ return FewShotPromptTemplate(
188
+ message_list_with_placeholder=res["message_list_with_placeholder"],
189
+ example_template=res["example_template"]
190
+ )
191
+ except Exception as e:
192
+ logger.error(f"Failed to compute few-shot prompt template: {e} - response: {response}")
193
+ raise
194
+
195
+ def _run_optimization(
196
+ self,
197
+ fewshot_prompt_template: FewShotPromptTemplate,
198
+ dataset: Dataset,
199
+ metric: Callable,
109
200
  n_trials: int = 10,
201
+ baseline_score: Optional[float] = None,
110
202
  optimization_id: Optional[str] = None,
111
203
  experiment_config: Optional[Dict] = None,
112
- n_samples: int = None,
204
+ n_samples: Optional[int] = None,
113
205
  ) -> optimization_result.OptimizationResult:
206
+ reporting.start_optimization_run(verbose=self.verbose)
207
+
114
208
  random.seed(self.seed)
115
209
  self.llm_call_counter = 0
116
-
117
- if not task_config.use_chat_prompt:
118
- raise ValueError(
119
- "Few-shot Bayesian optimization is only supported for chat prompts."
120
- )
121
-
122
- opik_dataset: opik.Dataset = dataset
123
-
210
+
124
211
  # Load the dataset
125
- if isinstance(dataset, str):
126
- opik_dataset = self._opik_client.get_dataset(dataset)
127
- dataset_items = opik_dataset.get_items()
128
- else:
129
- opik_dataset = dataset
130
- dataset_items = opik_dataset.get_items()
131
-
212
+ dataset_items = dataset.get_items()
213
+ all_dataset_item_ids = [item["id"] for item in dataset_items]
214
+ eval_dataset_item_ids = all_dataset_item_ids
215
+ if n_samples is not None and n_samples < len(dataset_items):
216
+ eval_dataset_item_ids = random.sample(all_dataset_item_ids, n_samples)
217
+
218
+ # Define the experiment configuration
132
219
  experiment_config = experiment_config or {}
133
220
  base_experiment_config = { # Base config for reuse
134
221
  **experiment_config,
135
222
  **{
136
223
  "optimizer": self.__class__.__name__,
137
- "metric": metric_config.metric.name,
138
- "dataset": opik_dataset.name,
224
+ "metric": metric.__name__,
225
+ "dataset": dataset.name,
139
226
  "configuration": {},
140
227
  },
141
228
  }
142
229
 
143
- # Evaluate Initial (Zero-Shot) Prompt
144
- logger.info("Evaluating initial (zero-shot) prompt...")
145
- initial_instruction = task_config.instruction_prompt
146
- zero_shot_param = prompt_parameter.ChatPromptParameter(
147
- name="zero_shot_prompt",
148
- instruction=initial_instruction,
149
- task_input_parameters=task_config.input_dataset_fields,
150
- task_output_parameter=task_config.output_dataset_field,
151
- demo_examples=[], # No examples
152
- )
153
- zero_shot_llm_task = self._build_task_from_prompt_template(
154
- zero_shot_param.as_template()
155
- )
156
-
157
- initial_eval_config = base_experiment_config.copy()
158
- initial_eval_config["configuration"]["prompt"] = initial_instruction
159
- initial_eval_config["configuration"]["n_examples"] = 0
160
-
161
- # Determine dataset item IDs for evaluation (initial and trials)
162
- all_dataset_item_ids = [item["id"] for item in dataset_items]
163
- eval_dataset_item_ids = all_dataset_item_ids
164
- if n_samples is not None and n_samples < len(all_dataset_item_ids):
165
- eval_dataset_item_ids = random.sample(all_dataset_item_ids, n_samples)
166
- logger.info(f"Using {n_samples} samples for evaluations.")
167
- else:
168
- logger.info(
169
- f"Using all {len(all_dataset_item_ids)} samples for evaluations."
170
- )
171
-
172
- initial_score = task_evaluator.evaluate(
173
- dataset=opik_dataset,
174
- dataset_item_ids=eval_dataset_item_ids,
175
- metric_config=metric_config,
176
- evaluated_task=zero_shot_llm_task,
177
- num_threads=self.n_threads,
178
- project_name=self.project_name,
179
- experiment_config=initial_eval_config,
180
- optimization_id=optimization_id,
181
- verbose=self.verbose,
182
- )
183
- logger.info(f"Initial (zero-shot) score: {initial_score:.4f}")
184
-
185
230
  # Start Optuna Study
186
- logger.info("Starting Optuna study for Few-Shot Bayesian Optimization...")
187
-
188
231
  def optimization_objective(trial: optuna.Trial) -> float:
189
232
  n_examples = trial.suggest_int(
190
233
  "n_examples", self.min_examples, self.max_examples
@@ -197,7 +240,7 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
197
240
  ]
198
241
  trial.set_user_attr("example_indices", example_indices)
199
242
 
200
- instruction = task_config.instruction_prompt
243
+ # Process few shot examples
201
244
  demo_examples = [dataset_items[idx] for idx in example_indices]
202
245
 
203
246
  processed_demo_examples = []
@@ -205,21 +248,29 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
205
248
  processed_example = {}
206
249
  for key, value in example.items():
207
250
  processed_example[key] = str(value)
208
- processed_demo_examples.append(processed_example)
209
-
210
- param = prompt_parameter.ChatPromptParameter(
211
- name=f"trial_{trial.number}_prompt",
212
- instruction=instruction,
213
- task_input_parameters=task_config.input_dataset_fields,
214
- task_output_parameter=task_config.output_dataset_field,
215
- demo_examples=processed_demo_examples,
251
+
252
+ try:
253
+ processed_demo_examples.append(
254
+ fewshot_prompt_template.example_template.format(**processed_example)
255
+ )
256
+ except Exception:
257
+ logger.error(f"Failed to format fewshot prompt template {fewshot_prompt_template} with example: {processed_example} ")
258
+ raise
259
+ few_shot_examples = "\n\n".join(processed_demo_examples)
260
+
261
+ llm_task = self._build_task_from_messages(
262
+ messages=fewshot_prompt_template.message_list_with_placeholder,
263
+ few_shot_examples=few_shot_examples
216
264
  )
217
265
 
218
- llm_task = self._build_task_from_prompt_template(param.as_template())
266
+ messages_for_reporting = [{
267
+ "role": item["role"],
268
+ "content": item["content"].replace(FEW_SHOT_EXAMPLE_PLACEHOLDER, few_shot_examples)
269
+ } for item in fewshot_prompt_template.message_list_with_placeholder]
219
270
 
220
271
  # Log trial config
221
272
  trial_config = base_experiment_config.copy()
222
- trial_config["configuration"]["prompt"] = instruction # Base instruction
273
+ trial_config["configuration"]["prompt"] = messages_for_reporting # Base instruction
223
274
  trial_config["configuration"][
224
275
  "examples"
225
276
  ] = processed_demo_examples # Log stringified examples
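Putting this hunk together: each Optuna trial stringifies its sampled dataset items, renders them through example_template, joins them with blank lines, and splices the result into the placeholder. A self-contained sketch of that rendering with illustrative data:

```python
# Standalone sketch of the per-trial rendering done in optimization_objective (data illustrative).
message_list_with_placeholder = [
    {"role": "system", "content": "Answer concisely.\n\nExamples:\nFEW_SHOT_EXAMPLE_PLACEHOLDER"},
    {"role": "user", "content": "{question}"},
]
example_template = "Question: {question}\nAnswer: {answer}"
demo_examples = [
    {"question": "2 + 2?", "answer": "4"},
    {"question": "Capital of France?", "answer": "Paris"},
]

few_shot_examples = "\n\n".join(
    example_template.format(**{k: str(v) for k, v in example.items()})
    for example in demo_examples
)
messages = [
    {"role": item["role"],
     "content": item["content"].replace("FEW_SHOT_EXAMPLE_PLACEHOLDER", few_shot_examples)}
    for item in message_list_with_placeholder
]
```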
@@ -231,21 +282,30 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
231
282
  )
232
283
  logger.debug(f"Evaluating trial {trial.number}...")
233
284
 
234
- score = task_evaluator.evaluate(
235
- dataset=opik_dataset,
236
- dataset_item_ids=eval_dataset_item_ids,
237
- metric_config=metric_config,
238
- evaluated_task=llm_task,
239
- num_threads=self.n_threads,
240
- project_name=self.project_name,
241
- experiment_config=trial_config,
242
- optimization_id=optimization_id,
243
- verbose=self.verbose,
244
- )
285
+ with reporting.start_optimization_trial(trial.number, n_trials, verbose=self.verbose) as trial_reporter:
286
+ trial_reporter.start_trial(messages_for_reporting)
287
+ score = task_evaluator.evaluate(
288
+ dataset=dataset,
289
+ dataset_item_ids=eval_dataset_item_ids,
290
+ metric=metric,
291
+ evaluated_task=llm_task,
292
+ num_threads=self.n_threads,
293
+ project_name=self.project_name,
294
+ experiment_config=trial_config,
295
+ optimization_id=optimization_id,
296
+ verbose=self.verbose,
297
+ )
298
+ trial_reporter.set_score(baseline_score, score)
245
299
  logger.debug(f"Trial {trial.number} score: {score:.4f}")
246
300
 
301
+ # Trial results
302
+ trial_config = {
303
+ "demo_examples": demo_examples,
304
+ "message_list_with_placeholder": fewshot_prompt_template.message_list_with_placeholder,
305
+ "message_list": messages_for_reporting
306
+ }
247
307
  trial.set_user_attr("score", score)
248
- trial.set_user_attr("param", param)
308
+ trial.set_user_attr("config", trial_config)
249
309
  return score
250
310
 
251
311
  # Configure Optuna Logging
@@ -265,29 +325,18 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
265
325
  sampler = optuna.samplers.TPESampler(seed=self.seed)
266
326
  study = optuna.create_study(direction="maximize", sampler=sampler)
267
327
 
268
- study.optimize(optimization_objective, n_trials=n_trials, show_progress_bar=(self.verbose >= 1))
269
- logger.info("Optuna study finished.")
270
-
328
+ study.optimize(
329
+ optimization_objective,
330
+ n_trials=n_trials,
331
+ show_progress_bar=False
332
+ )
333
+
271
334
  optuna_history_processed = []
272
335
  for trial_idx, trial in enumerate(study.trials):
273
336
  if trial.state == optuna.trial.TrialState.COMPLETE:
274
- param_obj: Optional[prompt_parameter.ChatPromptParameter] = trial.user_attrs.get("param")
275
- prompt_cand_display = None # Default to None
276
- if param_obj and hasattr(param_obj, 'as_template') and callable(param_obj.as_template):
277
- try:
278
- # .format() on ChatPromptTemplate returns the list of messages
279
- chat_messages_for_history = param_obj.as_template().format()
280
- prompt_cand_display = json.dumps(chat_messages_for_history)
281
- except Exception as e_param_format:
282
- logger.warning(f"Trial {trial.number}: Error formatting prompt from param_obj: {e_param_format}")
283
- prompt_cand_display = "Error: Could not format prompt content."
284
- elif not param_obj:
285
- logger.warning(f"Trial {trial.number}: 'param' object not found in user_attrs.")
286
- prompt_cand_display = "Error: Prompt data missing in trial."
287
- else:
288
- logger.warning(f"Trial {trial.number}: 'param' object is not of expected type or lacks methods.")
289
- prompt_cand_display = "Error: Invalid prompt data structure in trial."
290
-
337
+ trial_config = trial.user_attrs.get("config", {})
338
+ prompt_cand_display = trial_config.get('message_list') # Default to None
339
+
291
340
  score_val = trial.value # This can be None if trial failed to produce a score
292
341
  duration_val = None
293
342
  if trial.datetime_complete and trial.datetime_start:
@@ -298,16 +347,13 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
298
347
  "timestamp": trial.datetime_start.isoformat() if trial.datetime_start else datetime.now().isoformat(),
299
348
  "prompt_candidate": prompt_cand_display,
300
349
  "parameters_used": {
301
- "optuna_params": trial.params,
350
+ "optuna_params": trial.user_attrs.get("config", {}),
302
351
  "example_indices": trial.user_attrs.get("example_indices", []) # Default to empty list
303
352
  },
304
353
  "scores": [{
305
- "metric_name": metric_config.metric.name,
354
+ "metric_name": metric.__name__,
306
355
  "score": score_val, # Can be None
307
- "opik_evaluation_id": None # TODO
308
356
  }],
309
- "tokens_used": None, # TODO
310
- "cost": None, # TODO
311
357
  "duration_seconds": duration_val,
312
358
  }
313
359
  optuna_history_processed.append(iter_detail)
@@ -316,33 +362,30 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
316
362
 
317
363
  best_trial = study.best_trial
318
364
  best_score = best_trial.value
319
- best_n_examples = best_trial.params["n_examples"]
320
365
  best_example_indices = best_trial.user_attrs.get("example_indices", [])
321
- best_param: prompt_parameter.ChatPromptParameter = best_trial.user_attrs[
322
- "param"
323
- ]
324
366
 
325
- chat_messages_list = best_param.as_template().format()
326
- main_prompt_string = best_param.instruction
367
+ reporting.display_result(
368
+ initial_score=baseline_score,
369
+ best_score=best_score,
370
+ best_prompt=best_trial.user_attrs["config"]["message_list"],
371
+ verbose=self.verbose
372
+ )
327
373
 
328
374
  return optimization_result.OptimizationResult(
329
375
  optimizer=self.__class__.__name__,
330
- prompt=main_prompt_string,
376
+ prompt=best_trial.user_attrs["config"]["message_list"],
331
377
  score=best_score,
332
- metric_name=metric_config.metric.name,
378
+ metric_name=metric.__name__,
333
379
  details={
334
- "prompt_type": "chat" if task_config.use_chat_prompt else "non-chat",
335
- "chat_messages": chat_messages_list,
336
- "prompt_parameter": best_param,
337
- "n_examples": best_n_examples,
380
+ "chat_messages": best_trial.user_attrs["config"]["message_list"],
381
+ "prompt_parameter": best_trial.user_attrs["config"],
382
+ #"n_examples": best_n_examples,
338
383
  "example_indices": best_example_indices,
339
384
  "trial_number": best_trial.number,
340
- "initial_score": initial_score,
341
385
  "total_trials": n_trials,
342
386
  "rounds": [],
343
387
  "stopped_early": False,
344
- "metric_config": metric_config.model_dump(),
345
- "task_config": task_config.model_dump(),
388
+ "metric_name": metric.__name__,
346
389
  "model": self.model,
347
390
  "temperature": self.model_kwargs.get("temperature"),
348
391
  },
@@ -350,20 +393,32 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
350
393
  llm_calls=self.llm_call_counter
351
394
  )
352
395
 
353
- def optimize_prompt(
396
+ def optimize_prompt( # type: ignore
354
397
  self,
355
- dataset: Union[str, Dataset],
356
- metric_config: MetricConfig,
357
- task_config: TaskConfig,
398
+ prompt: chat_prompt.ChatPrompt,
399
+ dataset: Dataset,
400
+ metric: Callable,
358
401
  n_trials: int = 10,
359
402
  experiment_config: Optional[Dict] = None,
360
- n_samples: int = None,
403
+ n_samples: Optional[int] = None,
361
404
  ) -> optimization_result.OptimizationResult:
405
+ """
406
+ Args:
407
+ prompt: The prompt to optimize
408
+ dataset: Opik Dataset to optimize on
409
+ metric: Metric function to evaluate on
410
+ n_trials: Number of trials for Bayesian Optimization
411
+ experiment_config: Optional configuration for the experiment, useful to log additional metadata
412
+ n_samples: Optional number of items to test in the dataset
413
+
414
+ Returns:
415
+ OptimizationResult: Result of the optimization
416
+ """
362
417
  optimization = None
363
418
  try:
364
419
  optimization = self._opik_client.create_optimization(
365
420
  dataset_name=dataset.name,
366
- objective_name=metric_config.metric.name,
421
+ objective_name=metric.__name__,
367
422
  metadata={"optimizer": self.__class__.__name__},
368
423
  )
369
424
  except Exception:
@@ -373,72 +428,109 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
373
428
  optimization = None
374
429
 
375
430
  try:
376
- result = self._optimize_prompt(
377
- optimization_id=optimization.id if optimization is not None else None,
431
+ # Start experiment reporting
432
+ reporting.display_header("Few-Shot Bayesian Optimizer", verbose=self.verbose)
433
+ reporting.display_configuration(
434
+ prompt.formatted_messages,
435
+ optimizer_config={
436
+ "optimizer": self.__class__.__name__,
437
+ "metric": metric.__name__,
438
+ "n_trials": n_trials,
439
+ "n_samples": n_samples
440
+ },
441
+ verbose=self.verbose
442
+ )
443
+
444
+ utils.disable_experiment_reporting()
445
+
446
+ # Step 1. Compute the baseline evaluation
447
+ with reporting.display_evaluation(message="First we will establish the baseline performance:", verbose=self.verbose) as eval_report:
448
+ baseline_score = self.evaluate_prompt(
449
+ prompt=prompt,
450
+ dataset=dataset,
451
+ metric=metric,
452
+ n_samples=n_samples,
453
+ optimization_id=optimization.id if optimization is not None else None
454
+ )
455
+
456
+ eval_report.set_score(baseline_score)
457
+
458
+ # Step 2. Create the few-shot prompt template
459
+ with reporting.creation_few_shot_prompt_template(verbose=self.verbose) as fewshot_template_report:
460
+ fewshot_template = self._create_fewshot_prompt_template(
461
+ model=self.model,
462
+ prompt=prompt,
463
+ few_shot_examples=[{k: v for k, v in item.items() if k != 'id'}
464
+ for item in dataset.get_items(nb_samples=10)]
465
+ )
466
+
467
+ fewshot_template_report.set_fewshot_template(fewshot_template)
468
+
469
+ # Step 3. Start the optimization process
470
+ result = self._run_optimization(
471
+ fewshot_prompt_template=fewshot_template,
378
472
  dataset=dataset,
379
- metric_config=metric_config,
380
- task_config=task_config,
381
- n_trials=n_trials,
473
+ metric=metric,
474
+ optimization_id=optimization.id if optimization is not None else None,
382
475
  experiment_config=experiment_config,
476
+ n_trials=n_trials,
477
+ baseline_score=baseline_score,
383
478
  n_samples=n_samples,
384
479
  )
385
480
  if optimization:
386
481
  self.update_optimization(optimization, status="completed")
482
+
483
+ utils.enable_experiment_reporting()
387
484
  return result
388
485
  except Exception as e:
389
486
  if optimization:
390
487
  self.update_optimization(optimization, status="cancelled")
391
488
  logger.error(f"FewShotBayesian optimization failed: {e}", exc_info=True)
489
+ utils.enable_experiment_reporting()
392
490
  raise e
393
491
 
394
492
  def evaluate_prompt(
395
493
  self,
396
- prompt: List[Dict[Literal["role", "content"], str]],
494
+ prompt: chat_prompt.ChatPrompt,
397
495
  dataset: opik.Dataset,
398
- metric_config: MetricConfig,
399
- task_config: Optional[TaskConfig] = None,
496
+ metric: Callable,
400
497
  dataset_item_ids: Optional[List[str]] = None,
401
498
  experiment_config: Optional[Dict] = None,
402
- n_samples: int = None,
499
+ optimization_id: Optional[str] = None,
500
+ n_samples: Optional[int] = None,
403
501
  ) -> float:
404
-
405
- if isinstance(prompt, str):
406
- if task_config is None:
407
- raise ValueError(
408
- "To use a string prompt, please pass in task_config to evaluate_prompt()"
409
- )
410
-
411
- questions = {
412
- field: ("{{%s}}" % field) for field in task_config.input_dataset_fields
413
- }
414
- prompt = [
415
- {"role": "system", "content": prompt},
416
- {"role": "user", "content": json.dumps(questions)},
417
- ]
418
-
502
+ """
503
+ Args:
504
+ prompt: The prompt to evaluate
505
+ dataset: Opik Dataset to evaluate the prompt on
506
+ metric: Metric function to evaluate on, should have the arguments `dataset_item` and `llm_output`
507
+ dataset_item_ids: Optional list of dataset item IDs to evaluate
508
+ experiment_config: Optional configuration for the experiment
509
+ optimization_id: Optional ID of the optimization
510
+ n_samples: Optional number of items to test in the dataset
511
+ Returns:
512
+ float: The evaluation score
513
+ """
419
514
  # Ensure prompt is correctly formatted
420
515
  if not all(
421
516
  isinstance(item, dict) and "role" in item and "content" in item
422
- for item in prompt
517
+ for item in prompt.formatted_messages
423
518
  ):
424
519
  raise ValueError(
425
520
  "A ChatPrompt must be a list of dictionaries with 'role' and 'content' keys."
426
521
  )
427
522
 
428
- template = prompt_templates.ChatPromptTemplate(
429
- prompt, validate_placeholders=False
430
- )
431
- llm_task = self._build_task_from_prompt_template(template)
523
+ llm_task = self._build_task_from_messages(prompt.formatted_messages)
432
524
 
433
525
  experiment_config = experiment_config or {}
434
526
  experiment_config = {
435
527
  **experiment_config,
436
528
  **{
437
529
  "optimizer": self.__class__.__name__,
438
- "metric": metric_config.metric.name,
530
+ "metric": metric.__name__,
439
531
  "dataset": dataset.name,
440
532
  "configuration": {
441
- "examples": prompt,
533
+ "prompt": prompt.formatted_messages,
442
534
  },
443
535
  },
444
536
  }
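evaluate_prompt is also usable on its own to score a ChatPrompt without running the optimizer; continuing the sketch shown after the file list (metric, field names and sample count remain illustrative):

```python
# Quick standalone scoring pass; `optimizer`, `prompt` and `dataset` come from the earlier sketch.
baseline = optimizer.evaluate_prompt(
    prompt=prompt,
    dataset=dataset,
    metric=exact_match,
    n_samples=20,  # evaluate a random subsample instead of the full dataset
)
print(f"Baseline score: {baseline:.4f}")
```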
@@ -450,27 +542,39 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
450
542
  all_ids = [dataset_item["id"] for dataset_item in dataset.get_items()]
451
543
  dataset_item_ids = random.sample(all_ids, n_samples)
452
544
 
453
- logger.debug(f"Starting FewShotBayesian evaluation...")
545
+ logger.debug("Starting FewShotBayesian evaluation...")
454
546
  score = task_evaluator.evaluate(
455
547
  dataset=dataset,
456
548
  dataset_item_ids=dataset_item_ids,
457
- metric_config=metric_config,
549
+ metric=metric,
458
550
  evaluated_task=llm_task,
459
551
  num_threads=self.n_threads,
460
552
  project_name=self.project_name,
461
553
  experiment_config=experiment_config,
554
+ optimization_id=optimization_id,
462
555
  verbose=self.verbose,
463
556
  )
464
557
  logger.debug(f"Evaluation score: {score:.4f}")
465
558
 
466
559
  return score
467
560
 
468
- def _build_task_from_prompt_template(
469
- self, template: prompt_templates.ChatPromptTemplate
561
+
562
+ def _build_task_from_messages(
563
+ self, messages: List[Dict[str, str]], few_shot_examples: Optional[str] = None
470
564
  ):
471
565
  def llm_task(dataset_item: Dict[str, Any]) -> Dict[str, Any]:
472
- prompt_ = template.format(**dataset_item)
473
-
566
+ for key, value in dataset_item.items():
567
+ prompt_ = [{
568
+ "role": item["role"],
569
+ "content": item["content"].replace("{" + key + "}", str(value))
570
+ } for item in messages]
571
+
572
+ if few_shot_examples:
573
+ prompt_ = [{
574
+ "role": item["role"],
575
+ "content": item["content"].replace(FEW_SHOT_EXAMPLE_PLACEHOLDER, few_shot_examples)
576
+ } for item in prompt_]
577
+
474
578
  response = self._call_model(
475
579
  model=self.model,
476
580
  messages=prompt_,