opik-optimizer 0.8.1__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff shows the contents of publicly released package versions, as they appear in their respective public registries, and is provided for informational purposes only.
- opik_optimizer/__init__.py +15 -26
- opik_optimizer/base_optimizer.py +28 -44
- opik_optimizer/datasets/__init__.py +6 -7
- opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +742 -726
- opik_optimizer/evolutionary_optimizer/reporting.py +246 -0
- opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +297 -193
- opik_optimizer/few_shot_bayesian_optimizer/reporting.py +119 -0
- opik_optimizer/meta_prompt_optimizer/__init__.py +5 -0
- opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +816 -0
- opik_optimizer/meta_prompt_optimizer/reporting.py +140 -0
- opik_optimizer/mipro_optimizer/__init__.py +1 -1
- opik_optimizer/mipro_optimizer/_mipro_optimizer_v2.py +12 -20
- opik_optimizer/mipro_optimizer/mipro_optimizer.py +32 -52
- opik_optimizer/mipro_optimizer/utils.py +1 -23
- opik_optimizer/optimization_config/chat_prompt.py +106 -0
- opik_optimizer/optimization_config/configs.py +2 -21
- opik_optimizer/optimization_config/mappers.py +1 -1
- opik_optimizer/optimization_result.py +57 -85
- opik_optimizer/reporting_utils.py +180 -0
- opik_optimizer/task_evaluator.py +41 -26
- opik_optimizer/utils.py +187 -3
- {opik_optimizer-0.8.1.dist-info → opik_optimizer-0.9.0.dist-info}/METADATA +15 -31
- opik_optimizer-0.9.0.dist-info/RECORD +48 -0
- {opik_optimizer-0.8.1.dist-info → opik_optimizer-0.9.0.dist-info}/WHEEL +1 -1
- opik_optimizer/few_shot_bayesian_optimizer/prompt_parameter.py +0 -91
- opik_optimizer/few_shot_bayesian_optimizer/prompt_templates.py +0 -80
- opik_optimizer/integrations/__init__.py +0 -0
- opik_optimizer/meta_prompt_optimizer.py +0 -1151
- opik_optimizer-0.8.1.dist-info/RECORD +0 -45
- {opik_optimizer-0.8.1.dist-info → opik_optimizer-0.9.0.dist-info}/licenses/LICENSE +0 -0
- {opik_optimizer-0.8.1.dist-info → opik_optimizer-0.9.0.dist-info}/top_level.txt +0 -0
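
The main API change visible in this release is the new optimizer call surface: `optimize_prompt` and `evaluate_prompt` now take a `chat_prompt.ChatPrompt` plus a plain metric callable instead of the removed `TaskConfig`/`MetricConfig` objects, as the `few_shot_bayesian_optimizer.py` hunks below show. The following is only an illustrative sketch of that surface; the `ChatPrompt` constructor arguments, the dataset name, and the metric body are assumptions for illustration and are not taken from the diff.

```python
# Sketch of the 0.9.0 call surface shown in the few_shot_bayesian_optimizer.py diff below.
import opik

from opik_optimizer.few_shot_bayesian_optimizer.few_shot_bayesian_optimizer import (
    FewShotBayesianOptimizer,
)
from opik_optimizer.optimization_config import chat_prompt


def exact_match(dataset_item, llm_output):
    # evaluate_prompt documents the metric as a callable taking `dataset_item`
    # and `llm_output`; this toy metric (and the "answer" field) is hypothetical.
    return float(dataset_item["answer"].strip() == llm_output.strip())


dataset = opik.Opik().get_dataset("my-dataset")  # hypothetical dataset name

# Assumed ChatPrompt construction; the diff only shows that the object exposes
# `formatted_messages` as a list of {"role", "content"} dictionaries.
prompt = chat_prompt.ChatPrompt(
    messages=[
        {"role": "system", "content": "Answer the question concisely."},
        {"role": "user", "content": "{question}"},
    ]
)

optimizer = FewShotBayesianOptimizer(
    model="openai/gpt-4o-mini",  # any LiteLLM-style model identifier
    min_examples=2,
    max_examples=8,
    n_threads=8,
)

result = optimizer.optimize_prompt(
    prompt=prompt,
    dataset=dataset,
    metric=exact_match,  # 0.9.0 takes a plain callable instead of a MetricConfig
    n_trials=10,
    n_samples=50,
)
print(result.score, result.prompt)
```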
opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py
@@ -1,54 +1,108 @@
+import json
+import logging
 import random
-from
+from datetime import datetime
+from typing import Any, Callable, Dict, List, Optional, Tuple
+
+import litellm
 import opik
 import optuna
 import optuna.samplers
-import logging
-import json
-from datetime import datetime
-
 from opik import Dataset
-from
+from opik.evaluation.models.litellm import opik_monitor as opik_litellm_monitor
+from pydantic import BaseModel
 
-from opik_optimizer.optimization_config.configs import TaskConfig, MetricConfig
 from opik_optimizer import base_optimizer
+from opik_optimizer.optimization_config import mappers
 
-from
-from
-from
-from .. import optimization_result, task_evaluator
-
-import litellm
-
-from opik.evaluation.models.litellm import opik_monitor as opik_litellm_monitor
+from .. import _throttle, optimization_result, task_evaluator, utils
+from ..optimization_config import chat_prompt
+from . import reporting
 
 _limiter = _throttle.get_rate_limiter_for_current_opik_installation()
 
 logger = logging.getLogger(__name__)
 
+FEW_SHOT_EXAMPLE_PLACEHOLDER = "FEW_SHOT_EXAMPLE_PLACEHOLDER"
+SYSTEM_PROMPT_TEMPLATE = f"""
+You are a prompt editor that modifies a message list to support few-shot learning. Your job is to insert a placeholder where few-shot examples can be inserted and generate a reusable string template for formatting those examples.
+
+You will receive a JSON object with the following fields:
+
+- "message_list": a list of messages, each with a role (system, user, or assistant) and a content field.
+- "examples": a list of example pairs, each with input and output fields.
+
+Your task:
+
+- Insert the string "{FEW_SHOT_EXAMPLE_PLACEHOLDER}" into one of the messages in the list. Make sure to:
+    - Insert it at the most logical point for including few-shot examples — typically as part of the system message
+    - Add a section title in XML or markdown format. The examples will be provided as `example_1\nexample_2\n...` with each example following the example template.
+- Analyze the examples to infer a consistent structure, and create a single string few_shot_example_template using the Python .format() style. Make sure to follow the following instructions:
+    - Unless absolutely relevant, do not return an object but instead a string that can be inserted as part of {FEW_SHOT_EXAMPLE_PLACEHOLDER}
+    - Make sure to include the variables as part of this string so we can before string formatting with actual examples. Only variables available in the examples can be used.
+    - Do not apply any transformations to the variables either, only the variable name should be included in the format `{{<variable_name>}}`
+    - The few shot examples should include the expected response as the goal is to provide examples of the response.
+    - Ensure the format of the few shot examples are consistent with how the model will be called
+
+Return your output as a JSON object with:
+
+- message_list_with_placeholder: the updated list with "FEW_SHOT_EXAMPLE_PLACEHOLDER" inserted.
+- example_template: a string template using the fields provided in the examples (you don't need to use all of them)
+
+Respond only with the JSON object. Do not include any explanation or extra text.
+"""
+
+class FewShotPromptTemplate(BaseModel):
+    message_list_with_placeholder: List[Dict[str, str]]
+    example_template: str
 
 class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
+    """
+    The Few-Shot Bayesian Optimizer can be used to add few-shot examples to prompts. This algorithm
+    employes a two stage pipeline:
+
+    1. We generate a few-shot prompt template that is inserted can be inserted into the prompt
+       provided
+    2. We use Bayesian Optimization to determine the best examples to include in the prompt.
+
+    This algorithm is best used when you have a well defined task and would like to guide the LLM
+    by providing some examples.
+    """
     def __init__(
         self,
         model: str,
-        project_name: Optional[str] =
+        project_name: Optional[str] = "Optimization",
         min_examples: int = 2,
         max_examples: int = 8,
         seed: int = 42,
         n_threads: int = 8,
-        n_initial_prompts: int = 5,
-        n_iterations: int = 10,
         verbose: int = 1,
         **model_kwargs,
     ) -> None:
+        """
+        Args:
+            model: The model to used to evaluate the prompt
+            project_name: Optional project name for tracking
+            min_examples: Minimum number of examples to include
+            max_examples: Maximum number of examples to include
+            seed: Random seed for reproducibility
+            n_threads: Number of threads for parallel evaluation
+            verbose: Controls internal logging/progress bars (0=off, 1=on).
+            **model_kwargs: Additional model parameters
+        """
         super().__init__(model, project_name, **model_kwargs)
         self.min_examples = min_examples
         self.max_examples = max_examples
         self.seed = seed
         self.n_threads = n_threads
-        self.n_initial_prompts = n_initial_prompts
-        self.n_iterations = n_iterations
         self.verbose = verbose
+        if verbose == 0:
+            logger.setLevel(logging.WARNING)
+        elif verbose == 1:
+            logger.setLevel(logging.INFO)
+        elif verbose == 2:
+            logger.setLevel(logging.DEBUG)
+
         self._opik_client = opik.Opik()
         self.llm_call_counter = 0
         logger.debug(f"Initialized FewShotBayesianOptimizer with model: {model}")
@@ -66,7 +120,6 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
         filtered_call_kwargs.pop('n_iterations', None)
         filtered_call_kwargs.pop('min_examples', None)
         filtered_call_kwargs.pop('max_examples', None)
-        filtered_call_kwargs.pop('n_initial_prompts', None)
 
         final_params_for_litellm = opik_litellm_monitor.try_add_opik_monitoring_to_params(filtered_call_kwargs)
 
@@ -101,90 +154,80 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
         split_idx = int(len(dataset) * train_ratio)
         return dataset[:split_idx], dataset[split_idx:]
 
-    def
+    def _create_fewshot_prompt_template(
         self,
-
-
-
+        model: str,
+        prompt: chat_prompt.ChatPrompt,
+        few_shot_examples: List[Dict[str, Any]]
+    ) -> FewShotPromptTemplate:
+        """
+        During this step we update the system prompt to include few-shot examples.
+        """
+        user_message = {
+            "message_list": prompt.formatted_messages,
+            "examples": few_shot_examples
+        }
+
+        messages: List[Dict[str, str]] = [
+            {"role": "system", "content": SYSTEM_PROMPT_TEMPLATE},
+            {"role": "user", "content": json.dumps(user_message)},
+        ]
+
+        logger.debug(f"fewshot_prompt_template - Calling LLM with: {messages}")
+        response = self._call_model(
+            model,
+            messages,
+            self.seed,
+            self.model_kwargs
+        )
+        logger.debug(f"fewshot_prompt_template - LLM response: {response}")
+
+        try:
+            res = utils.json_to_dict(response["choices"][0]["message"]["content"])
+            return FewShotPromptTemplate(
+                message_list_with_placeholder=res["message_list_with_placeholder"],
+                example_template=res["example_template"]
+            )
+        except Exception as e:
+            logger.error(f"Failed to compute few-shot prompt template: {e} - response: {response}")
+            raise
+
+    def _run_optimization(
+        self,
+        fewshot_prompt_template: FewShotPromptTemplate,
+        dataset: Dataset,
+        metric: Callable,
         n_trials: int = 10,
+        baseline_score: Optional[float] = None,
         optimization_id: Optional[str] = None,
         experiment_config: Optional[Dict] = None,
-        n_samples: int = None,
+        n_samples: Optional[int] = None,
     ) -> optimization_result.OptimizationResult:
+        reporting.start_optimization_run(verbose=self.verbose)
+
         random.seed(self.seed)
         self.llm_call_counter = 0
-
-        if not task_config.use_chat_prompt:
-            raise ValueError(
-                "Few-shot Bayesian optimization is only supported for chat prompts."
-            )
-
-        opik_dataset: opik.Dataset = dataset
-
+
         # Load the dataset
-
-
-
-
-
-
-
+        dataset_items = dataset.get_items()
+        all_dataset_item_ids = [item["id"] for item in dataset_items]
+        eval_dataset_item_ids = all_dataset_item_ids
+        if n_samples is not None and n_samples < len(dataset_items):
+            eval_dataset_item_ids = random.sample(all_dataset_item_ids, n_samples)
+
+        # Define the experiment configuration
         experiment_config = experiment_config or {}
         base_experiment_config = { # Base config for reuse
             **experiment_config,
             **{
                 "optimizer": self.__class__.__name__,
-                "metric":
-                "dataset":
+                "metric": metric.__name__,
+                "dataset": dataset.name,
                 "configuration": {},
             },
         }
 
-        # Evaluate Initial (Zero-Shot) Prompt
-        logger.info("Evaluating initial (zero-shot) prompt...")
-        initial_instruction = task_config.instruction_prompt
-        zero_shot_param = prompt_parameter.ChatPromptParameter(
-            name="zero_shot_prompt",
-            instruction=initial_instruction,
-            task_input_parameters=task_config.input_dataset_fields,
-            task_output_parameter=task_config.output_dataset_field,
-            demo_examples=[], # No examples
-        )
-        zero_shot_llm_task = self._build_task_from_prompt_template(
-            zero_shot_param.as_template()
-        )
-
-        initial_eval_config = base_experiment_config.copy()
-        initial_eval_config["configuration"]["prompt"] = initial_instruction
-        initial_eval_config["configuration"]["n_examples"] = 0
-
-        # Determine dataset item IDs for evaluation (initial and trials)
-        all_dataset_item_ids = [item["id"] for item in dataset_items]
-        eval_dataset_item_ids = all_dataset_item_ids
-        if n_samples is not None and n_samples < len(all_dataset_item_ids):
-            eval_dataset_item_ids = random.sample(all_dataset_item_ids, n_samples)
-            logger.info(f"Using {n_samples} samples for evaluations.")
-        else:
-            logger.info(
-                f"Using all {len(all_dataset_item_ids)} samples for evaluations."
-            )
-
-        initial_score = task_evaluator.evaluate(
-            dataset=opik_dataset,
-            dataset_item_ids=eval_dataset_item_ids,
-            metric_config=metric_config,
-            evaluated_task=zero_shot_llm_task,
-            num_threads=self.n_threads,
-            project_name=self.project_name,
-            experiment_config=initial_eval_config,
-            optimization_id=optimization_id,
-            verbose=self.verbose,
-        )
-        logger.info(f"Initial (zero-shot) score: {initial_score:.4f}")
-
         # Start Optuna Study
-        logger.info("Starting Optuna study for Few-Shot Bayesian Optimization...")
-
         def optimization_objective(trial: optuna.Trial) -> float:
             n_examples = trial.suggest_int(
                 "n_examples", self.min_examples, self.max_examples
@@ -197,7 +240,7 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
             ]
             trial.set_user_attr("example_indices", example_indices)
 
-
+            # Process few shot examples
             demo_examples = [dataset_items[idx] for idx in example_indices]
 
             processed_demo_examples = []
@@ -205,21 +248,29 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
                 processed_example = {}
                 for key, value in example.items():
                     processed_example[key] = str(value)
-
-
-
-
-
-
-
-
+
+                try:
+                    processed_demo_examples.append(
+                        fewshot_prompt_template.example_template.format(**processed_example)
+                    )
+                except Exception:
+                    logger.error(f"Failed to format fewshot prompt template {fewshot_prompt_template} with example: {processed_example} ")
+                    raise
+            few_shot_examples = "\n\n".join(processed_demo_examples)
+
+            llm_task = self._build_task_from_messages(
+                messages=fewshot_prompt_template.message_list_with_placeholder,
+                few_shot_examples=few_shot_examples
             )
 
-
+            messages_for_reporting = [{
+                "role": item["role"],
+                "content": item["content"].replace(FEW_SHOT_EXAMPLE_PLACEHOLDER, few_shot_examples)
+            } for item in fewshot_prompt_template.message_list_with_placeholder]
 
             # Log trial config
             trial_config = base_experiment_config.copy()
-            trial_config["configuration"]["prompt"] =
+            trial_config["configuration"]["prompt"] = messages_for_reporting # Base instruction
             trial_config["configuration"][
                 "examples"
             ] = processed_demo_examples # Log stringified examples
@@ -231,21 +282,30 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
             )
             logger.debug(f"Evaluating trial {trial.number}...")
 
-
-
-
-
-
-
-
-
-
-
-
+            with reporting.start_optimization_trial(trial.number, n_trials, verbose=self.verbose) as trial_reporter:
+                trial_reporter.start_trial(messages_for_reporting)
+                score = task_evaluator.evaluate(
+                    dataset=dataset,
+                    dataset_item_ids=eval_dataset_item_ids,
+                    metric=metric,
+                    evaluated_task=llm_task,
+                    num_threads=self.n_threads,
+                    project_name=self.project_name,
+                    experiment_config=trial_config,
+                    optimization_id=optimization_id,
+                    verbose=self.verbose,
+                )
+                trial_reporter.set_score(baseline_score, score)
             logger.debug(f"Trial {trial.number} score: {score:.4f}")
 
+            # Trial results
+            trial_config = {
+                "demo_examples": demo_examples,
+                "message_list_with_placeholder": fewshot_prompt_template.message_list_with_placeholder,
+                "message_list": messages_for_reporting
+            }
             trial.set_user_attr("score", score)
-            trial.set_user_attr("
+            trial.set_user_attr("config", trial_config)
             return score
 
         # Configure Optuna Logging
@@ -265,29 +325,18 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
         sampler = optuna.samplers.TPESampler(seed=self.seed)
         study = optuna.create_study(direction="maximize", sampler=sampler)
 
-        study.optimize(
-
-
+        study.optimize(
+            optimization_objective,
+            n_trials=n_trials,
+            show_progress_bar=False
+        )
+
         optuna_history_processed = []
         for trial_idx, trial in enumerate(study.trials):
             if trial.state == optuna.trial.TrialState.COMPLETE:
-
-                prompt_cand_display =
-
-                try:
-                    # .format() on ChatPromptTemplate returns the list of messages
-                    chat_messages_for_history = param_obj.as_template().format()
-                    prompt_cand_display = json.dumps(chat_messages_for_history)
-                except Exception as e_param_format:
-                    logger.warning(f"Trial {trial.number}: Error formatting prompt from param_obj: {e_param_format}")
-                    prompt_cand_display = "Error: Could not format prompt content."
-                elif not param_obj:
-                    logger.warning(f"Trial {trial.number}: 'param' object not found in user_attrs.")
-                    prompt_cand_display = "Error: Prompt data missing in trial."
-                else:
-                    logger.warning(f"Trial {trial.number}: 'param' object is not of expected type or lacks methods.")
-                    prompt_cand_display = "Error: Invalid prompt data structure in trial."
-
+                trial_config = trial.user_attrs.get("config", {})
+                prompt_cand_display = trial_config.get('message_list') # Default to None
+
                 score_val = trial.value # This can be None if trial failed to produce a score
                 duration_val = None
                 if trial.datetime_complete and trial.datetime_start:
@@ -298,16 +347,13 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
                     "timestamp": trial.datetime_start.isoformat() if trial.datetime_start else datetime.now().isoformat(),
                     "prompt_candidate": prompt_cand_display,
                     "parameters_used": {
-                        "optuna_params": trial.
+                        "optuna_params": trial.user_attrs.get("config", {}),
                         "example_indices": trial.user_attrs.get("example_indices", []) # Default to empty list
                     },
                     "scores": [{
-                        "metric_name":
+                        "metric_name": metric.__name__,
                         "score": score_val, # Can be None
-                        "opik_evaluation_id": None # TODO
                     }],
-                    "tokens_used": None, # TODO
-                    "cost": None, # TODO
                     "duration_seconds": duration_val,
                 }
                 optuna_history_processed.append(iter_detail)
@@ -316,33 +362,30 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
 
         best_trial = study.best_trial
         best_score = best_trial.value
-        best_n_examples = best_trial.params["n_examples"]
         best_example_indices = best_trial.user_attrs.get("example_indices", [])
-        best_param: prompt_parameter.ChatPromptParameter = best_trial.user_attrs[
-            "param"
-        ]
 
-
-
+        reporting.display_result(
+            initial_score=baseline_score,
+            best_score=best_score,
+            best_prompt=best_trial.user_attrs["config"]["message_list"],
+            verbose=self.verbose
+        )
 
         return optimization_result.OptimizationResult(
             optimizer=self.__class__.__name__,
-            prompt=
+            prompt=best_trial.user_attrs["config"]["message_list"],
             score=best_score,
-            metric_name=
+            metric_name=metric.__name__,
             details={
-                "
-                "
-                "
-                "n_examples": best_n_examples,
+                "chat_messages": best_trial.user_attrs["config"]["message_list"],
+                "prompt_parameter": best_trial.user_attrs["config"],
+                #"n_examples": best_n_examples,
                 "example_indices": best_example_indices,
                 "trial_number": best_trial.number,
-                "initial_score": initial_score,
                 "total_trials": n_trials,
                 "rounds": [],
                 "stopped_early": False,
-                "
-                "task_config": task_config.model_dump(),
+                "metric_name": metric.__name__,
                 "model": self.model,
                 "temperature": self.model_kwargs.get("temperature"),
             },
@@ -350,20 +393,32 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
             llm_calls=self.llm_call_counter
         )
 
-    def optimize_prompt(
+    def optimize_prompt( # type: ignore
         self,
-
-
-
+        prompt: chat_prompt.ChatPrompt,
+        dataset: Dataset,
+        metric: Callable,
         n_trials: int = 10,
         experiment_config: Optional[Dict] = None,
-        n_samples: int = None,
+        n_samples: Optional[int] = None,
     ) -> optimization_result.OptimizationResult:
+        """
+        Args:
+            prompt: The prompt to optimize
+            dataset: Opik Dataset to optimize on
+            metric: Metric function to evaluate on
+            n_trials: Number of trials for Bayesian Optimization
+            experiment_config: Optional configuration for the experiment, useful to log additional metadata
+            n_samples: Optional number of items to test in the dataset
+
+        Returns:
+            OptimizationResult: Result of the optimization
+        """
         optimization = None
         try:
             optimization = self._opik_client.create_optimization(
                 dataset_name=dataset.name,
-                objective_name=
+                objective_name=metric.__name__,
                 metadata={"optimizer": self.__class__.__name__},
             )
         except Exception:
@@ -373,72 +428,109 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
             optimization = None
 
         try:
-
-
+            # Start experiment reporting
+            reporting.display_header("Few-Shot Bayesian Optimizer", verbose=self.verbose)
+            reporting.display_configuration(
+                prompt.formatted_messages,
+                optimizer_config={
+                    "optimizer": self.__class__.__name__,
+                    "metric": metric.__name__,
+                    "n_trials": n_trials,
+                    "n_samples": n_samples
+                },
+                verbose=self.verbose
+            )
+
+            utils.disable_experiment_reporting()
+
+            # Step 1. Compute the baseline evaluation
+            with reporting.display_evaluation(message="First we will establish the baseline performance:", verbose=self.verbose) as eval_report:
+                baseline_score = self.evaluate_prompt(
+                    prompt=prompt,
+                    dataset=dataset,
+                    metric=metric,
+                    n_samples=n_samples,
+                    optimization_id=optimization.id if optimization is not None else None
+                )
+
+                eval_report.set_score(baseline_score)
+
+            # Step 2. Create the few-shot prompt template
+            with reporting.creation_few_shot_prompt_template(verbose=self.verbose) as fewshot_template_report:
+                fewshot_template = self._create_fewshot_prompt_template(
+                    model=self.model,
+                    prompt=prompt,
+                    few_shot_examples=[{k: v for k, v in item.items() if k != 'id'}
+                                       for item in dataset.get_items(nb_samples=10)]
+                )
+
+                fewshot_template_report.set_fewshot_template(fewshot_template)
+
+            # Step 3. Start the optimization process
+            result = self._run_optimization(
+                fewshot_prompt_template=fewshot_template,
                 dataset=dataset,
-
-
-                n_trials=n_trials,
+                metric=metric,
+                optimization_id=optimization.id if optimization is not None else None,
                 experiment_config=experiment_config,
+                n_trials=n_trials,
+                baseline_score=baseline_score,
                 n_samples=n_samples,
             )
             if optimization:
                 self.update_optimization(optimization, status="completed")
+
+            utils.enable_experiment_reporting()
             return result
         except Exception as e:
             if optimization:
                 self.update_optimization(optimization, status="cancelled")
             logger.error(f"FewShotBayesian optimization failed: {e}", exc_info=True)
+            utils.enable_experiment_reporting()
             raise e
 
     def evaluate_prompt(
         self,
-        prompt:
+        prompt: chat_prompt.ChatPrompt,
         dataset: opik.Dataset,
-
-        task_config: Optional[TaskConfig] = None,
+        metric: Callable,
         dataset_item_ids: Optional[List[str]] = None,
         experiment_config: Optional[Dict] = None,
-
+        optimization_id: Optional[str] = None,
+        n_samples: Optional[int] = None,
     ) -> float:
-
-
-
-
-
-
-
-
-
-
-
-
-            {"role": "user", "content": json.dumps(questions)},
-        ]
-
+        """
+        Args:
+            prompt: The prompt to evaluate
+            dataset: Opik Dataset to evaluate the prompt on
+            metric: Metric function to evaluate on, should have the arguments `dataset_item` and `llm_output`
+            dataset_item_ids: Optional list of dataset item IDs to evaluate
+            experiment_config: Optional configuration for the experiment
+            optimization_id: Optional ID of the optimization
+            n_samples: Optional number of items to test in the dataset
+        Returns:
+            float: The evaluation score
+        """
         # Ensure prompt is correctly formatted
         if not all(
             isinstance(item, dict) and "role" in item and "content" in item
-            for item in prompt
+            for item in prompt.formatted_messages
        ):
             raise ValueError(
                 "A ChatPrompt must be a list of dictionaries with 'role' and 'content' keys."
             )
 
-
-            prompt, validate_placeholders=False
-        )
-        llm_task = self._build_task_from_prompt_template(template)
+        llm_task = self._build_task_from_messages(prompt.formatted_messages)
 
         experiment_config = experiment_config or {}
         experiment_config = {
             **experiment_config,
             **{
                 "optimizer": self.__class__.__name__,
-                "metric":
+                "metric": metric.__name__,
                 "dataset": dataset.name,
                 "configuration": {
-                    "
+                    "prompt": prompt.formatted_messages,
                 },
             },
         }
@@ -450,27 +542,39 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
             all_ids = [dataset_item["id"] for dataset_item in dataset.get_items()]
             dataset_item_ids = random.sample(all_ids, n_samples)
 
-        logger.debug(
+        logger.debug("Starting FewShotBayesian evaluation...")
         score = task_evaluator.evaluate(
             dataset=dataset,
             dataset_item_ids=dataset_item_ids,
-
+            metric=metric,
             evaluated_task=llm_task,
             num_threads=self.n_threads,
             project_name=self.project_name,
             experiment_config=experiment_config,
+            optimization_id=optimization_id,
             verbose=self.verbose,
         )
         logger.debug(f"Evaluation score: {score:.4f}")
 
         return score
 
-
-
+
+    def _build_task_from_messages(
+        self, messages: List[Dict[str, str]], few_shot_examples: Optional[str] = None
     ):
         def llm_task(dataset_item: Dict[str, Any]) -> Dict[str, Any]:
-
-
+            for key, value in dataset_item.items():
+                prompt_ = [{
+                    "role": item["role"],
+                    "content": item["content"].replace("{" + key + "}", str(value))
+                } for item in messages]
+
+            if few_shot_examples:
+                prompt_ = [{
+                    "role": item["role"],
+                    "content": item["content"].replace(FEW_SHOT_EXAMPLE_PLACEHOLDER, few_shot_examples)
+                } for item in prompt_]
+
             response = self._call_model(
                 model=self.model,
                 messages=prompt_,
|