opik-optimizer 0.8.0__py3-none-any.whl → 0.9.0rc0__py3-none-any.whl
This diff compares the contents of two publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- opik_optimizer/__init__.py +15 -26
- opik_optimizer/base_optimizer.py +28 -44
- opik_optimizer/data/hotpot-500.json +501 -1001
- opik_optimizer/datasets/__init__.py +6 -7
- opik_optimizer/datasets/hotpot_qa.py +2 -1
- opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +742 -726
- opik_optimizer/evolutionary_optimizer/reporting.py +246 -0
- opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +296 -194
- opik_optimizer/few_shot_bayesian_optimizer/reporting.py +119 -0
- opik_optimizer/meta_prompt_optimizer/__init__.py +5 -0
- opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +816 -0
- opik_optimizer/meta_prompt_optimizer/reporting.py +140 -0
- opik_optimizer/mipro_optimizer/__init__.py +1 -1
- opik_optimizer/mipro_optimizer/_mipro_optimizer_v2.py +12 -20
- opik_optimizer/mipro_optimizer/mipro_optimizer.py +32 -52
- opik_optimizer/mipro_optimizer/utils.py +1 -23
- opik_optimizer/optimization_config/chat_prompt.py +106 -0
- opik_optimizer/optimization_config/configs.py +2 -21
- opik_optimizer/optimization_config/mappers.py +1 -1
- opik_optimizer/optimization_result.py +57 -85
- opik_optimizer/reporting_utils.py +180 -0
- opik_optimizer/task_evaluator.py +33 -25
- opik_optimizer/utils.py +187 -3
- {opik_optimizer-0.8.0.dist-info → opik_optimizer-0.9.0rc0.dist-info}/METADATA +15 -31
- opik_optimizer-0.9.0rc0.dist-info/RECORD +48 -0
- {opik_optimizer-0.8.0.dist-info → opik_optimizer-0.9.0rc0.dist-info}/WHEEL +1 -1
- opik_optimizer/few_shot_bayesian_optimizer/prompt_parameter.py +0 -91
- opik_optimizer/few_shot_bayesian_optimizer/prompt_templates.py +0 -80
- opik_optimizer/integrations/__init__.py +0 -0
- opik_optimizer/meta_prompt_optimizer.py +0 -1151
- opik_optimizer-0.8.0.dist-info/RECORD +0 -45
- {opik_optimizer-0.8.0.dist-info → opik_optimizer-0.9.0rc0.dist-info}/licenses/LICENSE +0 -0
- {opik_optimizer-0.8.0.dist-info → opik_optimizer-0.9.0rc0.dist-info}/top_level.txt +0 -0
opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py

```diff
@@ -1,54 +1,107 @@
+import json
+import logging
 import random
-from
+from datetime import datetime
+from typing import Any, Callable, Dict, List, Optional, Tuple
+
+import litellm
 import opik
 import optuna
 import optuna.samplers
-import logging
-import json
-from datetime import datetime
-
 from opik import Dataset
-from
+from opik.evaluation.models.litellm import opik_monitor as opik_litellm_monitor
+from pydantic import BaseModel
 
-from opik_optimizer.optimization_config.configs import TaskConfig, MetricConfig
 from opik_optimizer import base_optimizer
+from opik_optimizer.optimization_config import mappers
 
-from
-from
-from
-from .. import optimization_result, task_evaluator
-
-import litellm
-
-from opik.evaluation.models.litellm import opik_monitor as opik_litellm_monitor
+from .. import _throttle, optimization_result, task_evaluator, utils
+from ..optimization_config import chat_prompt
+from . import reporting
 
 _limiter = _throttle.get_rate_limiter_for_current_opik_installation()
 
 logger = logging.getLogger(__name__)
 
+FEW_SHOT_EXAMPLE_PLACEHOLDER = "FEW_SHOT_EXAMPLE_PLACEHOLDER"
+SYSTEM_PROMPT_TEMPLATE = f"""
+You are a prompt editor that modifies a message list to support few-shot learning. Your job is to insert a placeholder where few-shot examples can be inserted and generate a reusable string template for formatting those examples.
+
+You will receive a JSON object with the following fields:
+
+- "message_list": a list of messages, each with a role (system, user, or assistant) and a content field.
+- "examples": a list of example pairs, each with input and output fields.
+
+Your task:
+
+- Insert the string "{FEW_SHOT_EXAMPLE_PLACEHOLDER}" into one of the messages in the list. Make sure to:
+    - Insert it at the most logical point for including few-shot examples — typically as part of the system message
+    - Add a section title in XML or markdown format. The examples will be provided as `example_1\nexample_2\n...` with each example following the example template.
+- Analyze the examples to infer a consistent structure, and create a single string few_shot_example_template using the Python .format() style. Make sure to follow the following instructions:
+    - Unless absolutely relevant, do not return an object but instead a string that can be inserted as part of {FEW_SHOT_EXAMPLE_PLACEHOLDER}
+    - Make sure to include the variables as part of this string so we can before string formatting with actual examples. Only variables available in the examples can be used. Do not use anything else, do not apply any transformations to the variables either.
+    - The few shot examples should include the expected response as the goal is to provide examples of the expected output format.
+    - Ensure the format of the few shot examples are consistent with how the model will be called
+
+Return your output as a JSON object with:
+
+- message_list_with_placeholder: the updated list with "FEW_SHOT_EXAMPLE_PLACEHOLDER" inserted.
+- example_template: a string template using the fields provided in the examples (you don't need to use all of them)
+
+Respond only with the JSON object. Do not include any explanation or extra text.
+"""
+
+class FewShotPromptTemplate(BaseModel):
+    message_list_with_placeholder: List[Dict[str, str]]
+    example_template: str
 
 class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
+    """
+    The Few-Shot Bayesian Optimizer can be used to add few-shot examples to prompts. This algorithm
+    employes a two stage pipeline:
+
+    1. We generate a few-shot prompt template that is inserted can be inserted into the prompt
+       provided
+    2. We use Bayesian Optimization to determine the best examples to include in the prompt.
+
+    This algorithm is best used when you have a well defined task and would like to guide the LLM
+    by providing some examples.
+    """
     def __init__(
         self,
         model: str,
-        project_name: Optional[str] =
+        project_name: Optional[str] = "Optimization",
         min_examples: int = 2,
         max_examples: int = 8,
         seed: int = 42,
         n_threads: int = 8,
-        n_initial_prompts: int = 5,
-        n_iterations: int = 10,
         verbose: int = 1,
         **model_kwargs,
     ) -> None:
+        """
+        Args:
+            model: The model to used to evaluate the prompt
+            project_name: Optional project name for tracking
+            min_examples: Minimum number of examples to include
+            max_examples: Maximum number of examples to include
+            seed: Random seed for reproducibility
+            n_threads: Number of threads for parallel evaluation
+            verbose: Controls internal logging/progress bars (0=off, 1=on).
+            **model_kwargs: Additional model parameters
+        """
         super().__init__(model, project_name, **model_kwargs)
         self.min_examples = min_examples
         self.max_examples = max_examples
         self.seed = seed
         self.n_threads = n_threads
-        self.n_initial_prompts = n_initial_prompts
-        self.n_iterations = n_iterations
         self.verbose = verbose
+        if verbose == 0:
+            logger.setLevel(logging.WARNING)
+        elif verbose == 1:
+            logger.setLevel(logging.INFO)
+        elif verbose == 2:
+            logger.setLevel(logging.DEBUG)
+
         self._opik_client = opik.Opik()
         self.llm_call_counter = 0
         logger.debug(f"Initialized FewShotBayesianOptimizer with model: {model}")
@@ -66,7 +119,6 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
         filtered_call_kwargs.pop('n_iterations', None)
         filtered_call_kwargs.pop('min_examples', None)
         filtered_call_kwargs.pop('max_examples', None)
-        filtered_call_kwargs.pop('n_initial_prompts', None)
 
         final_params_for_litellm = opik_litellm_monitor.try_add_opik_monitoring_to_params(filtered_call_kwargs)
 
@@ -101,90 +153,80 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
         split_idx = int(len(dataset) * train_ratio)
         return dataset[:split_idx], dataset[split_idx:]
 
-    def
+    def _create_fewshot_prompt_template(
         self,
-
-
-
+        model: str,
+        prompt: chat_prompt.ChatPrompt,
+        few_shot_examples: List[Dict[str, Any]]
+    ) -> FewShotPromptTemplate:
+        """
+        During this step we update the system prompt to include few-shot examples.
+        """
+        user_message = {
+            "message_list": prompt.formatted_messages,
+            "examples": few_shot_examples
+        }
+
+        messages: List[Dict[str, str]] = [
+            {"role": "system", "content": SYSTEM_PROMPT_TEMPLATE},
+            {"role": "user", "content": json.dumps(user_message)},
+        ]
+
+        logger.debug(f"fewshot_prompt_template - Calling LLM with: {messages}")
+        response = self._call_model(
+            model,
+            messages,
+            self.seed,
+            self.model_kwargs
+        )
+        logger.debug(f"fewshot_prompt_template - LLM response: {response}")
+
+        try:
+            res = utils.json_to_dict(response["choices"][0]["message"]["content"])
+            return FewShotPromptTemplate(
+                message_list_with_placeholder=res["message_list_with_placeholder"],
+                example_template=res["example_template"]
+            )
+        except Exception as e:
+            logger.error(f"Failed to compute few-shot prompt template: {e} - response: {response}")
+            raise
+
+    def _run_optimization(
+        self,
+        fewshot_prompt_template: FewShotPromptTemplate,
+        dataset: Dataset,
+        metric: Callable,
         n_trials: int = 10,
+        baseline_score: Optional[float] = None,
         optimization_id: Optional[str] = None,
         experiment_config: Optional[Dict] = None,
-        n_samples: int = None,
+        n_samples: Optional[int] = None,
     ) -> optimization_result.OptimizationResult:
+        reporting.start_optimization_run(verbose=self.verbose)
+
         random.seed(self.seed)
         self.llm_call_counter = 0
-
-        if not task_config.use_chat_prompt:
-            raise ValueError(
-                "Few-shot Bayesian optimization is only supported for chat prompts."
-            )
-
-        opik_dataset: opik.Dataset = dataset
-
+
         # Load the dataset
-
-
-
-
-
-
-
+        dataset_items = dataset.get_items()
+        all_dataset_item_ids = [item["id"] for item in dataset_items]
+        eval_dataset_item_ids = all_dataset_item_ids
+        if n_samples is not None and n_samples < len(dataset_items):
+            eval_dataset_item_ids = random.sample(all_dataset_item_ids, n_samples)
+
+        # Define the experiment configuration
         experiment_config = experiment_config or {}
         base_experiment_config = {  # Base config for reuse
             **experiment_config,
             **{
                 "optimizer": self.__class__.__name__,
-                "metric":
-                "dataset":
+                "metric": metric.__name__,
+                "dataset": dataset.name,
                 "configuration": {},
             },
         }
 
-        # Evaluate Initial (Zero-Shot) Prompt
-        logger.info("Evaluating initial (zero-shot) prompt...")
-        initial_instruction = task_config.instruction_prompt
-        zero_shot_param = prompt_parameter.ChatPromptParameter(
-            name="zero_shot_prompt",
-            instruction=initial_instruction,
-            task_input_parameters=task_config.input_dataset_fields,
-            task_output_parameter=task_config.output_dataset_field,
-            demo_examples=[],  # No examples
-        )
-        zero_shot_llm_task = self._build_task_from_prompt_template(
-            zero_shot_param.as_template()
-        )
-
-        initial_eval_config = base_experiment_config.copy()
-        initial_eval_config["configuration"]["prompt"] = initial_instruction
-        initial_eval_config["configuration"]["n_examples"] = 0
-
-        # Determine dataset item IDs for evaluation (initial and trials)
-        all_dataset_item_ids = [item["id"] for item in dataset_items]
-        eval_dataset_item_ids = all_dataset_item_ids
-        if n_samples is not None and n_samples < len(all_dataset_item_ids):
-            eval_dataset_item_ids = random.sample(all_dataset_item_ids, n_samples)
-            logger.info(f"Using {n_samples} samples for evaluations.")
-        else:
-            logger.info(
-                f"Using all {len(all_dataset_item_ids)} samples for evaluations."
-            )
-
-        initial_score = task_evaluator.evaluate(
-            dataset=opik_dataset,
-            dataset_item_ids=eval_dataset_item_ids,
-            metric_config=metric_config,
-            evaluated_task=zero_shot_llm_task,
-            num_threads=self.n_threads,
-            project_name=self.project_name,
-            experiment_config=initial_eval_config,
-            optimization_id=optimization_id,
-            verbose=self.verbose,
-        )
-        logger.info(f"Initial (zero-shot) score: {initial_score:.4f}")
-
         # Start Optuna Study
-        logger.info("Starting Optuna study for Few-Shot Bayesian Optimization...")
-
         def optimization_objective(trial: optuna.Trial) -> float:
             n_examples = trial.suggest_int(
                 "n_examples", self.min_examples, self.max_examples
@@ -197,7 +239,7 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
             ]
             trial.set_user_attr("example_indices", example_indices)
 
-
+            # Process few shot examples
             demo_examples = [dataset_items[idx] for idx in example_indices]
 
             processed_demo_examples = []
@@ -205,21 +247,29 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
                 processed_example = {}
                 for key, value in example.items():
                     processed_example[key] = str(value)
-
-
-
-
-
-
-
-
+
+                try:
+                    processed_demo_examples.append(
+                        fewshot_prompt_template.example_template.format(**processed_example)
+                    )
+                except Exception as e:
+                    logger.error(f"Failed to format fewshot prompt template {fewshot_prompt_template} with example: {processed_example} ")
+                    raise
+            few_shot_examples = "\n\n".join(processed_demo_examples)
+
+            llm_task = self._build_task_from_messages(
+                messages=fewshot_prompt_template.message_list_with_placeholder,
+                few_shot_examples=few_shot_examples
             )
 
-
+            messages_for_reporting = [{
+                "role": item["role"],
+                "content": item["content"].replace(FEW_SHOT_EXAMPLE_PLACEHOLDER, few_shot_examples)
+            } for item in fewshot_prompt_template.message_list_with_placeholder]
 
             # Log trial config
             trial_config = base_experiment_config.copy()
-            trial_config["configuration"]["prompt"] =
+            trial_config["configuration"]["prompt"] = messages_for_reporting  # Base instruction
             trial_config["configuration"][
                 "examples"
             ] = processed_demo_examples  # Log stringified examples
@@ -231,21 +281,30 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
             )
             logger.debug(f"Evaluating trial {trial.number}...")
 
-
-
-
-
-
-
-
-
-
-
-
+            with reporting.start_optimization_trial(trial.number, n_trials, verbose=self.verbose) as trial_reporter:
+                trial_reporter.start_trial(messages_for_reporting)
+                score = task_evaluator.evaluate(
+                    dataset=dataset,
+                    dataset_item_ids=eval_dataset_item_ids,
+                    metric=metric,
+                    evaluated_task=llm_task,
+                    num_threads=self.n_threads,
+                    project_name=self.project_name,
+                    experiment_config=trial_config,
+                    optimization_id=optimization_id,
+                    verbose=self.verbose,
+                )
+                trial_reporter.set_score(baseline_score, score)
             logger.debug(f"Trial {trial.number} score: {score:.4f}")
 
+            # Trial results
+            trial_config = {
+                "demo_examples": demo_examples,
+                "message_list_with_placeholder": fewshot_prompt_template.message_list_with_placeholder,
+                "message_list": messages
+            }
             trial.set_user_attr("score", score)
-            trial.set_user_attr("
+            trial.set_user_attr("config", trial_config)
             return score
 
         # Configure Optuna Logging
@@ -265,29 +324,18 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
         sampler = optuna.samplers.TPESampler(seed=self.seed)
         study = optuna.create_study(direction="maximize", sampler=sampler)
 
-        study.optimize(
-
-
+        study.optimize(
+            optimization_objective,
+            n_trials=n_trials,
+            show_progress_bar=False
+        )
+
         optuna_history_processed = []
        for trial_idx, trial in enumerate(study.trials):
             if trial.state == optuna.trial.TrialState.COMPLETE:
-
-                prompt_cand_display =
-
-                    try:
-                        # .format() on ChatPromptTemplate returns the list of messages
-                        chat_messages_for_history = param_obj.as_template().format()
-                        prompt_cand_display = json.dumps(chat_messages_for_history)
-                    except Exception as e_param_format:
-                        logger.warning(f"Trial {trial.number}: Error formatting prompt from param_obj: {e_param_format}")
-                        prompt_cand_display = "Error: Could not format prompt content."
-                elif not param_obj:
-                    logger.warning(f"Trial {trial.number}: 'param' object not found in user_attrs.")
-                    prompt_cand_display = "Error: Prompt data missing in trial."
-                else:
-                    logger.warning(f"Trial {trial.number}: 'param' object is not of expected type or lacks methods.")
-                    prompt_cand_display = "Error: Invalid prompt data structure in trial."
-
+                trial_config = trial.user_attrs.get("config", {})
+                prompt_cand_display = trial_config.get('message_list')  # Default to None
+
                 score_val = trial.value  # This can be None if trial failed to produce a score
                 duration_val = None
                 if trial.datetime_complete and trial.datetime_start:
@@ -298,16 +346,13 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
                     "timestamp": trial.datetime_start.isoformat() if trial.datetime_start else datetime.now().isoformat(),
                     "prompt_candidate": prompt_cand_display,
                     "parameters_used": {
-                        "optuna_params": trial.
+                        "optuna_params": trial.user_attrs.get("config", {}),
                         "example_indices": trial.user_attrs.get("example_indices", [])  # Default to empty list
                     },
                     "scores": [{
-                        "metric_name":
+                        "metric_name": metric.__name__,
                         "score": score_val,  # Can be None
-                        "opik_evaluation_id": None  # TODO
                     }],
-                    "tokens_used": None,  # TODO
-                    "cost": None,  # TODO
                     "duration_seconds": duration_val,
                 }
                 optuna_history_processed.append(iter_detail)
@@ -316,33 +361,30 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
 
         best_trial = study.best_trial
         best_score = best_trial.value
-        best_n_examples = best_trial.params["n_examples"]
         best_example_indices = best_trial.user_attrs.get("example_indices", [])
-        best_param: prompt_parameter.ChatPromptParameter = best_trial.user_attrs[
-            "param"
-        ]
 
-
-
+        reporting.display_result(
+            initial_score=baseline_score,
+            best_score=best_score,
+            best_prompt=best_trial.user_attrs["config"]["message_list"],
+            verbose=self.verbose
+        )
 
         return optimization_result.OptimizationResult(
             optimizer=self.__class__.__name__,
-            prompt=
+            prompt=best_trial.user_attrs["config"]["message_list"],
             score=best_score,
-            metric_name=
+            metric_name=metric.__name__,
             details={
-                "
-                "
-                "
-                "n_examples": best_n_examples,
+                "chat_messages": best_trial.user_attrs["config"]["message_list"],
+                "prompt_parameter": best_trial.user_attrs["config"],
+                #"n_examples": best_n_examples,
                 "example_indices": best_example_indices,
                 "trial_number": best_trial.number,
-                "initial_score": initial_score,
                 "total_trials": n_trials,
                 "rounds": [],
                 "stopped_early": False,
-                "
-                "task_config": task_config.dict(),
+                "metric_name": metric.__name__,
                 "model": self.model,
                 "temperature": self.model_kwargs.get("temperature"),
             },
@@ -350,20 +392,32 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
             llm_calls=self.llm_call_counter
         )
 
-    def optimize_prompt(
+    def optimize_prompt(  # type: ignore
         self,
-
-
-
+        prompt: chat_prompt.ChatPrompt,
+        dataset: Dataset,
+        metric: Callable,
         n_trials: int = 10,
         experiment_config: Optional[Dict] = None,
-        n_samples: int = None,
+        n_samples: Optional[int] = None,
     ) -> optimization_result.OptimizationResult:
+        """
+        Args:
+            prompt: The prompt to optimize
+            dataset: Opik Dataset to optimize on
+            metric: Metric function to evaluate on
+            n_trials: Number of trials for Bayesian Optimization
+            experiment_config: Optional configuration for the experiment, useful to log additional metadata
+            n_samples: Optional number of items to test in the dataset
+
+        Returns:
+            OptimizationResult: Result of the optimization
+        """
         optimization = None
         try:
             optimization = self._opik_client.create_optimization(
                 dataset_name=dataset.name,
-                objective_name=
+                objective_name=metric.__name__,
                 metadata={"optimizer": self.__class__.__name__},
             )
         except Exception:
@@ -373,72 +427,109 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
             optimization = None
 
         try:
-
-
+            # Start experiment reporting
+            reporting.display_header("Few-Shot Bayesian Optimizer", verbose=self.verbose)
+            reporting.display_configuration(
+                prompt.formatted_messages,
+                optimizer_config={
+                    "optimizer": self.__class__.__name__,
+                    "metric": metric.__name__,
+                    "n_trials": n_trials,
+                    "n_samples": n_samples
+                },
+                verbose=self.verbose
+            )
+
+            utils.disable_experiment_reporting()
+
+            # Step 1. Compute the baseline evaluation
+            with reporting.display_evaluation(message="First we will establish the baseline performance:", verbose=self.verbose) as eval_report:
+                baseline_score = self.evaluate_prompt(
+                    prompt=prompt,
+                    dataset=dataset,
+                    metric=metric,
+                    n_samples=n_samples,
+                    optimization_id=optimization.id if optimization is not None else None
+                )
+
+                eval_report.set_score(baseline_score)
+
+            # Step 2. Create the few-shot prompt template
+            with reporting.creation_few_shot_prompt_template(verbose=self.verbose) as fewshot_template_report:
+                fewshot_template = self._create_fewshot_prompt_template(
+                    model=self.model,
+                    prompt=prompt,
+                    few_shot_examples=[{k: v for k, v in item.items() if k != 'id'}
+                                       for item in dataset.get_items(nb_samples=10)]
+                )
+
+                fewshot_template_report.set_fewshot_template(fewshot_template)
+
+            # Step 3. Start the optimization process
+            result = self._run_optimization(
+                fewshot_prompt_template=fewshot_template,
                 dataset=dataset,
-
-
-                n_trials=n_trials,
+                metric=metric,
+                optimization_id=optimization.id if optimization is not None else None,
                 experiment_config=experiment_config,
+                n_trials=n_trials,
+                baseline_score=baseline_score,
                 n_samples=n_samples,
             )
             if optimization:
                 self.update_optimization(optimization, status="completed")
+
+            utils.enable_experiment_reporting()
             return result
         except Exception as e:
             if optimization:
                 self.update_optimization(optimization, status="cancelled")
             logger.error(f"FewShotBayesian optimization failed: {e}", exc_info=True)
+            utils.enable_experiment_reporting()
             raise e
 
     def evaluate_prompt(
         self,
-        prompt:
+        prompt: chat_prompt.ChatPrompt,
         dataset: opik.Dataset,
-
-        task_config: Optional[TaskConfig] = None,
+        metric: Callable,
         dataset_item_ids: Optional[List[str]] = None,
         experiment_config: Optional[Dict] = None,
-
+        optimization_id: Optional[str] = None,
+        n_samples: Optional[int] = None,
    ) -> float:
-
-
-
-
-
-
-
-
-
-
-
-
-            {"role": "user", "content": json.dumps(questions)},
-        ]
-
+        """
+        Args:
+            prompt: The prompt to evaluate
+            dataset: Opik Dataset to evaluate the prompt on
+            metric: Metric function to evaluate on, should have the arguments `dataset_item` and `llm_output`
+            dataset_item_ids: Optional list of dataset item IDs to evaluate
+            experiment_config: Optional configuration for the experiment
+            optimization_id: Optional ID of the optimization
+            n_samples: Optional number of items to test in the dataset
+        Returns:
+            float: The evaluation score
+        """
         # Ensure prompt is correctly formatted
         if not all(
             isinstance(item, dict) and "role" in item and "content" in item
-            for item in prompt
+            for item in prompt.formatted_messages
         ):
             raise ValueError(
                 "A ChatPrompt must be a list of dictionaries with 'role' and 'content' keys."
             )
 
-
-            prompt, validate_placeholders=False
-        )
-        llm_task = self._build_task_from_prompt_template(template)
+        llm_task = self._build_task_from_messages(prompt.formatted_messages)
 
         experiment_config = experiment_config or {}
         experiment_config = {
             **experiment_config,
             **{
                 "optimizer": self.__class__.__name__,
-                "metric":
+                "metric": metric.__name__,
                 "dataset": dataset.name,
                 "configuration": {
-                    "
+                    "prompt": prompt.formatted_messages,
                 },
             },
         }
@@ -450,27 +541,38 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
             all_ids = [dataset_item["id"] for dataset_item in dataset.get_items()]
             dataset_item_ids = random.sample(all_ids, n_samples)
 
-        logger.debug(
+        logger.debug("Starting FewShotBayesian evaluation...")
         score = task_evaluator.evaluate(
             dataset=dataset,
             dataset_item_ids=dataset_item_ids,
-
+            metric=metric,
             evaluated_task=llm_task,
             num_threads=self.n_threads,
             project_name=self.project_name,
             experiment_config=experiment_config,
+            optimization_id=optimization_id,
             verbose=self.verbose,
         )
         logger.debug(f"Evaluation score: {score:.4f}")
 
         return score
 
-
-
+
+    def _build_task_from_messages(
+        self, messages: List[Dict[str, str]], few_shot_examples: Optional[str] = None
     ):
         def llm_task(dataset_item: Dict[str, Any]) -> Dict[str, Any]:
-            prompt_ =
-
+            prompt_ = [{
+                "role": item["role"],
+                "content": item["content"].format(**dataset_item)
+            } for item in messages]
+
+            if few_shot_examples:
+                prompt_ = [{
+                    "role": item["role"],
+                    "content": item["content"].replace(FEW_SHOT_EXAMPLE_PLACEHOLDER, few_shot_examples)
+                } for item in prompt_]
+
             response = self._call_model(
                 model=self.model,
                 messages=prompt_,
@@ -482,4 +584,4 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
                 mappers.EVALUATED_LLM_TASK_OUTPUT: response.choices[0].message.content
             }
 
-        return llm_task
+        return llm_task, messages
```