opik-optimizer 0.9.2__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opik_optimizer/__init__.py +7 -5
- opik_optimizer/_throttle.py +8 -8
- opik_optimizer/base_optimizer.py +98 -45
- opik_optimizer/cache_config.py +5 -3
- opik_optimizer/datasets/ai2_arc.py +15 -13
- opik_optimizer/datasets/cnn_dailymail.py +19 -15
- opik_optimizer/datasets/election_questions.py +10 -11
- opik_optimizer/datasets/gsm8k.py +16 -11
- opik_optimizer/datasets/halu_eval.py +6 -5
- opik_optimizer/datasets/hotpot_qa.py +17 -16
- opik_optimizer/datasets/medhallu.py +10 -7
- opik_optimizer/datasets/rag_hallucinations.py +11 -8
- opik_optimizer/datasets/ragbench.py +17 -9
- opik_optimizer/datasets/tiny_test.py +33 -37
- opik_optimizer/datasets/truthful_qa.py +18 -12
- opik_optimizer/demo/cache.py +6 -6
- opik_optimizer/demo/datasets.py +3 -7
- opik_optimizer/evolutionary_optimizer/__init__.py +3 -1
- opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +722 -429
- opik_optimizer/evolutionary_optimizer/reporting.py +155 -74
- opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +271 -188
- opik_optimizer/few_shot_bayesian_optimizer/reporting.py +79 -28
- opik_optimizer/logging_config.py +19 -15
- opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +209 -129
- opik_optimizer/meta_prompt_optimizer/reporting.py +121 -46
- opik_optimizer/mipro_optimizer/__init__.py +2 -0
- opik_optimizer/mipro_optimizer/_lm.py +38 -9
- opik_optimizer/mipro_optimizer/_mipro_optimizer_v2.py +37 -26
- opik_optimizer/mipro_optimizer/mipro_optimizer.py +132 -63
- opik_optimizer/mipro_optimizer/utils.py +5 -2
- opik_optimizer/optimizable_agent.py +179 -0
- opik_optimizer/optimization_config/chat_prompt.py +143 -73
- opik_optimizer/optimization_config/configs.py +4 -3
- opik_optimizer/optimization_config/mappers.py +18 -6
- opik_optimizer/optimization_result.py +22 -13
- opik_optimizer/py.typed +0 -0
- opik_optimizer/reporting_utils.py +89 -58
- opik_optimizer/task_evaluator.py +12 -14
- opik_optimizer/utils.py +117 -14
- {opik_optimizer-0.9.2.dist-info → opik_optimizer-1.0.1.dist-info}/METADATA +8 -8
- opik_optimizer-1.0.1.dist-info/RECORD +50 -0
- opik_optimizer-0.9.2.dist-info/RECORD +0 -48
- {opik_optimizer-0.9.2.dist-info → opik_optimizer-1.0.1.dist-info}/WHEEL +0 -0
- {opik_optimizer-0.9.2.dist-info → opik_optimizer-1.0.1.dist-info}/licenses/LICENSE +0 -0
- {opik_optimizer-0.9.2.dist-info → opik_optimizer-1.0.1.dist-info}/top_level.txt +0 -0
@@ -1,23 +1,24 @@
-import
+from typing import Any, Callable, Dict, List, Optional, Tuple, Type
+
 import json
 import logging
 import random
 from datetime import datetime
-from typing import Any, Callable, Dict, List, Optional, Tuple
 
 import litellm
-import opik
 import optuna
 import optuna.samplers
+
+import opik
 from opik import Dataset
 from opik.evaluation.models.litellm import opik_monitor as opik_litellm_monitor
 from pydantic import BaseModel
 
 from opik_optimizer import base_optimizer
-from
-
+from ..utils import create_litellm_agent_class
+from ..optimization_config import chat_prompt, mappers
+from ..optimizable_agent import OptimizableAgent
 from .. import _throttle, optimization_result, task_evaluator, utils
-from ..optimization_config import chat_prompt
 from . import reporting
 
 _limiter = _throttle.get_rate_limiter_for_current_opik_installation()
@@ -53,37 +54,38 @@ Return your output as a JSON object with:
 Respond only with the JSON object. Do not include any explanation or extra text.
 """
 
+
 class FewShotPromptTemplate(BaseModel):
     message_list_with_placeholder: List[Dict[str, str]]
     example_template: str
 
+
 class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
     """
     The Few-Shot Bayesian Optimizer can be used to add few-shot examples to prompts. This algorithm
     employes a two stage pipeline:
 
-    1. We generate a few-shot prompt template that is inserted can be inserted into the prompt
+    1. We generate a few-shot prompt template that is inserted can be inserted into the prompt
     provided
     2. We use Bayesian Optimization to determine the best examples to include in the prompt.
 
     This algorithm is best used when you have a well defined task and would like to guide the LLM
     by providing some examples.
     """
+
     def __init__(
         self,
         model: str,
-        project_name: Optional[str] = "Optimization",
         min_examples: int = 2,
         max_examples: int = 8,
         seed: int = 42,
         n_threads: int = 8,
         verbose: int = 1,
-        **model_kwargs,
+        **model_kwargs: Any,
     ) -> None:
         """
         Args:
             model: The model to used to evaluate the prompt
-            project_name: Optional project name for tracking
             min_examples: Minimum number of examples to include
             max_examples: Maximum number of examples to include
             seed: Random seed for reproducibility
@@ -91,38 +93,55 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
             verbose: Controls internal logging/progress bars (0=off, 1=on).
             **model_kwargs: Additional model parameters
         """
-        super().__init__(model,
+        super().__init__(model, verbose, **model_kwargs)
         self.min_examples = min_examples
         self.max_examples = max_examples
         self.seed = seed
         self.n_threads = n_threads
-        self.verbose
-        if verbose == 0:
+        if self.verbose == 0:
             logger.setLevel(logging.WARNING)
-        elif verbose == 1:
+        elif self.verbose == 1:
             logger.setLevel(logging.INFO)
-        elif verbose == 2:
+        elif self.verbose == 2:
             logger.setLevel(logging.DEBUG)
-
+
         self._opik_client = opik.Opik()
         self.llm_call_counter = 0
         logger.debug(f"Initialized FewShotBayesianOptimizer with model: {model}")
 
     @_throttle.rate_limited(_limiter)
-    def _call_model(
+    def _call_model(
+        self,
+        model: str,
+        messages: List[Dict[str, str]],
+        seed: int,
+        model_kwargs: Dict[str, Any],
+    ) -> Dict[str, Any]:
+        """
+        Args:
+            model: The model to use for the call
+            messages: List of message dictionaries with 'role' and 'content' keys
+            seed: Random seed for reproducibility
+            model_kwargs: Additional model parameters
+
+        Returns:
+            Dict containing the model's response
+        """
         self.llm_call_counter += 1
 
         current_model_kwargs = self.model_kwargs.copy()
         current_model_kwargs.update(model_kwargs)
 
         filtered_call_kwargs = current_model_kwargs.copy()
-        filtered_call_kwargs.pop(
-        filtered_call_kwargs.pop(
-        filtered_call_kwargs.pop(
-        filtered_call_kwargs.pop(
-        filtered_call_kwargs.pop(
-
-        final_params_for_litellm =
+        filtered_call_kwargs.pop("n_trials", None)
+        filtered_call_kwargs.pop("n_samples", None)
+        filtered_call_kwargs.pop("n_iterations", None)
+        filtered_call_kwargs.pop("min_examples", None)
+        filtered_call_kwargs.pop("max_examples", None)
+
+        final_params_for_litellm = (
+            opik_litellm_monitor.try_add_opik_monitoring_to_params(filtered_call_kwargs)
+        )
 
         response = litellm.completion(
             model=self.model,
@@ -136,6 +155,16 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
     def _split_dataset(
         self, dataset: List[Dict[str, Any]], train_ratio: float
     ) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
+        """
+        Split the dataset into training and validation sets.
+
+        Args:
+            dataset: List of dataset items
+            train_ratio: Ratio of items to use for training
+
+        Returns:
+            Tuple of (train_set, validation_set)
+        """
         """Split the dataset into training and validation sets.
 
         Args:
@@ -159,48 +188,56 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
         self,
         model: str,
         prompt: chat_prompt.ChatPrompt,
-        few_shot_examples: List[Dict[str, Any]]
+        few_shot_examples: List[Dict[str, Any]],
     ) -> FewShotPromptTemplate:
+        """
+        Generate a few-shot prompt template that can be used to insert examples into the prompt.
+
+        Args:
+            model: The model to use for generating the template
+            prompt: The base prompt to modify
+            few_shot_examples: List of example pairs with input and output fields
+
+        Returns:
+            FewShotPromptTemplate containing the modified message list and example template
+        """
         """
         During this step we update the system prompt to include few-shot examples.
         """
         user_message = {
-            "message_list": prompt.
-            "examples": few_shot_examples
+            "message_list": prompt.get_messages(),
+            "examples": few_shot_examples,
         }
-
+
         messages: List[Dict[str, str]] = [
             {"role": "system", "content": SYSTEM_PROMPT_TEMPLATE},
             {"role": "user", "content": json.dumps(user_message)},
         ]
-
+
         logger.debug(f"fewshot_prompt_template - Calling LLM with: {messages}")
-        response = self._call_model(
-            model,
-            messages,
-            self.seed,
-            self.model_kwargs
-        )
+        response = self._call_model(model, messages, self.seed, self.model_kwargs)
         logger.debug(f"fewshot_prompt_template - LLM response: {response}")
 
         try:
             res = utils.json_to_dict(response["choices"][0]["message"]["content"])
             return FewShotPromptTemplate(
                 message_list_with_placeholder=res["message_list_with_placeholder"],
-                example_template=res["example_template"]
+                example_template=res["example_template"],
             )
         except Exception as e:
-            logger.error(
+            logger.error(
+                f"Failed to compute few-shot prompt template: {e} - response: {response}"
+            )
             raise
 
     def _run_optimization(
         self,
-
+        prompt: chat_prompt.ChatPrompt,
         fewshot_prompt_template: FewShotPromptTemplate,
         dataset: Dataset,
         metric: Callable,
+        baseline_score: float,
         n_trials: int = 10,
-        baseline_score: Optional[float] = None,
         optimization_id: Optional[str] = None,
         experiment_config: Optional[Dict] = None,
         n_samples: Optional[int] = None,
|
         reporting.start_optimization_run(verbose=self.verbose)
 
         random.seed(self.seed)
-
-
+
         # Load the dataset
         dataset_items = dataset.get_items()
         all_dataset_item_ids = [item["id"] for item in dataset_items]
         eval_dataset_item_ids = all_dataset_item_ids
         if n_samples is not None and n_samples < len(dataset_items):
             eval_dataset_item_ids = random.sample(all_dataset_item_ids, n_samples)
-
+
         # Define the experiment configuration
         experiment_config = experiment_config or {}
         base_experiment_config = { # Base config for reuse
             **experiment_config,
             **{
                 "optimizer": self.__class__.__name__,
+                "agent_class": self.agent_class.__name__,
+                "agent_config": prompt.to_dict(),
                 "metric": metric.__name__,
                 "dataset": dataset.name,
                 "configuration": {},
|
                 for key, value in example.items():
                     processed_example[key] = str(value)
 
-                processed_demo_example=fewshot_prompt_template.example_template
+                processed_demo_example = fewshot_prompt_template.example_template
                 for key, value in processed_example.items():
                     try:
-                        processed_demo_example=processed_demo_example.replace(
+                        processed_demo_example = processed_demo_example.replace(
+                            f"{{{key}}}", str(value)
+                        )
                     except Exception:
-                        logger.error(
+                        logger.error(
+                            f"Failed to format fewshot prompt template {fewshot_prompt_template} with example: {processed_example} "
+                        )
                         raise
                 processed_demo_examples.append(processed_demo_example)
             few_shot_examples = "\n\n".join(processed_demo_examples)
-
+
             llm_task = self._build_task_from_messages(
+                prompt=prompt,
                 messages=fewshot_prompt_template.message_list_with_placeholder,
-                few_shot_examples=few_shot_examples
+                few_shot_examples=few_shot_examples,
             )
 
-            messages_for_reporting = [
-
-
-
+            messages_for_reporting = [
+                {
+                    "role": item["role"],
+                    "content": item["content"].replace(
+                        FEW_SHOT_EXAMPLE_PLACEHOLDER, few_shot_examples
+                    ),
+                }
+                for item in fewshot_prompt_template.message_list_with_placeholder
+            ]
 
             # Log trial config
             trial_config = base_experiment_config.copy()
-            trial_config["configuration"]["prompt"] =
-
-
-            ] =
+            trial_config["configuration"]["prompt"] = (
+                messages_for_reporting # Base instruction
+            )
+            trial_config["configuration"]["examples"] = (
+                processed_demo_examples # Log stringified examples
+            )
             trial_config["configuration"]["n_examples"] = n_examples
             trial_config["configuration"]["example_indices"] = example_indices
 
|
             )
             logger.debug(f"Evaluating trial {trial.number}...")
 
-            with reporting.start_optimization_trial(
+            with reporting.start_optimization_trial(
+                trial.number, n_trials, verbose=self.verbose
+            ) as trial_reporter:
                 trial_reporter.start_trial(messages_for_reporting)
                 score = task_evaluator.evaluate(
                     dataset=dataset,
|
                     metric=metric,
                     evaluated_task=llm_task,
                     num_threads=self.n_threads,
-                    project_name=self.project_name,
+                    project_name=self.agent_class.project_name,
                     experiment_config=trial_config,
                     optimization_id=optimization_id,
                     verbose=self.verbose,
|
             trial_config = {
                 "demo_examples": demo_examples,
                 "message_list_with_placeholder": fewshot_prompt_template.message_list_with_placeholder,
-                "message_list": messages_for_reporting
+                "message_list": messages_for_reporting,
             }
             trial.set_user_attr("score", score)
             trial.set_user_attr("config", trial_config)
|
         # Explicitly create and seed the sampler for Optuna
         sampler = optuna.samplers.TPESampler(seed=self.seed)
         study = optuna.create_study(direction="maximize", sampler=sampler)
-
+
         study.optimize(
-            optimization_objective,
-            n_trials=n_trials,
-            show_progress_bar=False
+            optimization_objective, n_trials=n_trials, show_progress_bar=False
         )
-
+
         optuna_history_processed = []
         for trial_idx, trial in enumerate(study.trials):
             if trial.state == optuna.trial.TrialState.COMPLETE:
                 trial_config = trial.user_attrs.get("config", {})
-                prompt_cand_display = trial_config.get(
-
-
+                prompt_cand_display = trial_config.get(
+                    "message_list"
+                ) # Default to None
+
+                score_val = (
+                    trial.value
+                ) # This can be None if trial failed to produce a score
                 duration_val = None
                 if trial.datetime_complete and trial.datetime_start:
-                    duration_val = (
+                    duration_val = (
+                        trial.datetime_complete - trial.datetime_start
+                    ).total_seconds()
 
                 iter_detail = {
-                    "iteration": trial.number + 1,
-                    "timestamp":
+                    "iteration": trial.number + 1,
+                    "timestamp": (
+                        trial.datetime_start.isoformat()
+                        if trial.datetime_start
+                        else datetime.now().isoformat()
+                    ),
                     "prompt_candidate": prompt_cand_display,
-                    "parameters_used": {
-                        "optuna_params": trial.user_attrs.get("config", {}),
-                        "example_indices": trial.user_attrs.get(
+                    "parameters_used": {
+                        "optuna_params": trial.user_attrs.get("config", {}),
+                        "example_indices": trial.user_attrs.get(
+                            "example_indices", []
+                        ), # Default to empty list
                     },
-                    "scores": [
-
-
-
+                    "scores": [
+                        {
+                            "metric_name": metric.__name__,
+                            "score": score_val, # Can be None
+                        }
+                    ],
                     "duration_seconds": duration_val,
                 }
                 optuna_history_processed.append(iter_detail)
             else:
-                logger.warning(
+                logger.warning(
+                    f"Skipping trial {trial.number} from history due to state: {trial.state}. Value: {trial.value}"
+                )
 
         best_trial = study.best_trial
         best_score = best_trial.value
|
 
         if best_score <= baseline_score:
             best_score = baseline_score
-            best_prompt =
+            best_prompt = prompt.get_messages()
         else:
             best_prompt = best_trial.user_attrs["config"]["message_list"]
 
         reporting.display_result(
             initial_score=baseline_score,
             best_score=best_score,
-            best_prompt=
-            verbose=self.verbose
+            best_prompt=best_prompt,
+            verbose=self.verbose,
         )
 
         return optimization_result.OptimizationResult(
             optimizer=self.__class__.__name__,
-            prompt=
-            initial_prompt=
+            prompt=best_prompt,
+            initial_prompt=prompt.get_messages(),
             initial_score=baseline_score,
             score=best_score,
             metric_name=metric.__name__,
             details={
                 "initial_score": baseline_score,
-                "chat_messages":
+                "chat_messages": (
+                    best_trial.user_attrs["config"]["message_list"]
+                    if best_trial.user_attrs["config"]
+                    else []
+                ),
                 "prompt_parameter": best_trial.user_attrs["config"],
-                #"n_examples": best_n_examples,
+                # "n_examples": best_n_examples,
                 "example_indices": best_example_indices,
                 "trial_number": best_trial.number,
                 "total_trials": n_trials,
|
                 "temperature": self.model_kwargs.get("temperature"),
             },
             history=optuna_history_processed,
-            llm_calls=self.llm_call_counter
+            llm_calls=self.llm_call_counter,
+            dataset_id=dataset.id,
+            optimization_id=optimization_id,
         )
 
-    def optimize_prompt(
+    def optimize_prompt(  # type: ignore
         self,
         prompt: chat_prompt.ChatPrompt,
         dataset: Dataset,
         metric: Callable,
         n_trials: int = 10,
+        agent_class: Optional[Type[OptimizableAgent]] = None,
         experiment_config: Optional[Dict] = None,
         n_samples: Optional[int] = None,
     ) -> optimization_result.OptimizationResult:
         """
         Args:
-            prompt:
+            prompt:
             dataset: Opik Dataset to optimize on
             metric: Metric function to evaluate on
             n_trials: Number of trials for Bayesian Optimization
             experiment_config: Optional configuration for the experiment, useful to log additional metadata
             n_samples: Optional number of items to test in the dataset
-
+
         Returns:
             OptimizationResult: Result of the optimization
         """
         if not isinstance(prompt, chat_prompt.ChatPrompt):
             raise ValueError("Prompt must be a ChatPrompt object")
-
+
         if not isinstance(dataset, Dataset):
             raise ValueError("Dataset must be a Dataset object")
-
-        if not isinstance(metric, Callable):
-            raise ValueError("Metric must be a function that takes `dataset_item` and `llm_output` as arguments.")
 
+        if not callable(metric):
+            raise ValueError(
+                "Metric must be a function that takes `dataset_item` and `llm_output` as arguments."
+            )
+
+        if prompt.model is None:
+            prompt.model = self.model
+        if prompt.model_kwargs is None:
+            prompt.model_kwargs = self.model_kwargs
+
+        if agent_class is None:
+            self.agent_class = create_litellm_agent_class(prompt)
+        else:
+            self.agent_class = agent_class
 
         optimization = None
         try:
|
         optimization = None
         optimization_run_id = None
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            )
-
-        utils.disable_experiment_reporting()
-
-        # Step 1. Compute the baseline evaluation
-        with reporting.display_evaluation(message="First we will establish the baseline performance:", verbose=self.verbose) as eval_report:
-            baseline_score = self.evaluate_prompt(
-                prompt=prompt,
-                dataset=dataset,
-                metric=metric,
-                n_samples=n_samples,
-                optimization_id=optimization.id if optimization is not None else None
-            )
-
-            eval_report.set_score(baseline_score)
-
-        # Step 2. Create the few-shot prompt template
-        with reporting.creation_few_shot_prompt_template(verbose=self.verbose) as fewshot_template_report:
-            fewshot_template = self._create_fewshot_prompt_template(
-                model=self.model,
-                prompt=prompt,
-                few_shot_examples=[{k: v for k, v in item.items() if k != 'id'}
-                                   for item in dataset.get_items(nb_samples=10)]
-            )
+        # Start experiment reporting
+        reporting.display_header(
+            algorithm=self.__class__.__name__,
+            optimization_id=optimization_run_id,
+            dataset_id=dataset.id,
+            verbose=self.verbose,
+        )
+        reporting.display_configuration(
+            prompt.get_messages(),
+            optimizer_config={
+                "optimizer": self.__class__.__name__,
+                "metric": metric.__name__,
+                "n_trials": n_trials,
+                "n_samples": n_samples,
+            },
+            verbose=self.verbose,
+        )
 
-
+        utils.disable_experiment_reporting()
 
-
-
-
-
+        # Step 1. Compute the baseline evaluation
+        with reporting.display_evaluation(
+            message="First we will establish the baseline performance:",
+            verbose=self.verbose,
+        ) as eval_report:
+            baseline_score = self._evaluate_prompt(
+                prompt,
                 dataset=dataset,
                 metric=metric,
-                optimization_id=optimization.id if optimization is not None else None,
-                experiment_config=experiment_config,
-                n_trials=n_trials,
-                baseline_score=baseline_score,
                 n_samples=n_samples,
+                optimization_id=(optimization.id if optimization is not None else None),
             )
-            if optimization:
-                self.update_optimization(optimization, status="completed")
 
-
-            return result
-        except Exception as e:
-            if optimization:
-                self.update_optimization(optimization, status="cancelled")
-            logger.error(f"FewShotBayesian optimization failed: {e}", exc_info=True)
-            utils.enable_experiment_reporting()
-            raise e
+            eval_report.set_score(baseline_score)
 
-
+        # Step 2. Create the few-shot prompt template
+        with reporting.creation_few_shot_prompt_template(
+            verbose=self.verbose
+        ) as fewshot_template_report:
+            fewshot_template = self._create_fewshot_prompt_template(
+                model=self.model,
+                prompt=prompt,
+                few_shot_examples=[
+                    {k: v for k, v in item.items() if k != "id"}
+                    for item in dataset.get_items(nb_samples=10)
+                ],
+            )
+
+            fewshot_template_report.set_fewshot_template(fewshot_template)
+
+        # Step 3. Start the optimization process
+        result = self._run_optimization(
+            prompt=prompt,
+            fewshot_prompt_template=fewshot_template,
+            dataset=dataset,
+            metric=metric,
+            baseline_score=baseline_score,
+            optimization_id=optimization.id if optimization is not None else None,
+            experiment_config=experiment_config,
+            n_trials=n_trials,
+            n_samples=n_samples,
+        )
+        if optimization:
+            self.update_optimization(optimization, status="completed")
+
+        utils.enable_experiment_reporting()
+        return result
+
+    def _evaluate_prompt(
         self,
         prompt: chat_prompt.ChatPrompt,
         dataset: opik.Dataset,
         metric: Callable,
+        n_samples: Optional[int] = None,
         dataset_item_ids: Optional[List[str]] = None,
         experiment_config: Optional[Dict] = None,
         optimization_id: Optional[str] = None,
-
+        **kwargs: Any,
     ) -> float:
         """
         Args:
-            prompt: The prompt to evaluate
             dataset: Opik Dataset to evaluate the prompt on
             metric: Metric function to evaluate on, should have the arguments `dataset_item` and `llm_output`
             dataset_item_ids: Optional list of dataset item IDs to evaluate
|
         Returns:
             float: The evaluation score
         """
-
-        if not all(
-            isinstance(item, dict) and "role" in item and "content" in item
-            for item in prompt.formatted_messages
-        ):
-            raise ValueError(
-                "A ChatPrompt must be a list of dictionaries with 'role' and 'content' keys."
-            )
-
-        llm_task = self._build_task_from_messages(prompt.formatted_messages)
+        llm_task = self._build_task_from_messages(prompt, prompt.get_messages())
 
         experiment_config = experiment_config or {}
+        experiment_config["project_name"] = self.agent_class.__name__
         experiment_config = {
             **experiment_config,
             **{
                 "optimizer": self.__class__.__name__,
+                "agent_class": self.agent_class.__name__,
+                "agent_config": prompt.to_dict(),
                 "metric": metric.__name__,
                 "dataset": dataset.name,
-                "configuration": {
-                    "prompt": prompt.formatted_messages,
-                },
+                "configuration": {"prompt": prompt.get_messages()},
             },
         }
 
|
             metric=metric,
             evaluated_task=llm_task,
             num_threads=self.n_threads,
-            project_name=self.project_name,
+            project_name=self.agent_class.project_name,
             experiment_config=experiment_config,
             optimization_id=optimization_id,
             verbose=self.verbose,
|
 
         return score
 
-
     def _build_task_from_messages(
-        self,
-
+        self,
+        prompt: chat_prompt.ChatPrompt,
+        messages: List[Dict[str, str]],
+        few_shot_examples: Optional[str] = None,
+    ) -> Callable[[Dict[str, Any]], Dict[str, Any]]:
+        new_prompt = prompt.copy()
+        new_prompt.set_messages(messages)
+        agent = self.agent_class(new_prompt)
+
         def llm_task(dataset_item: Dict[str, Any]) -> Dict[str, Any]:
-
-
-
-
+            """
+            Process a single dataset item through the LLM task.
+
+            Args:
+                dataset_item: Dictionary containing the dataset item data
+
+            Returns:
+                Dictionary containing the LLM's response
+            """
+            messages = new_prompt.get_messages(dataset_item)
 
             if few_shot_examples:
-                for
-
-
-
-                    model=self.model,
-                    messages=prompt_,
-                    seed=self.seed,
-                    model_kwargs=self.model_kwargs
-                )
+                for message in messages:
+                    message["content"] = message["content"].replace(
+                        FEW_SHOT_EXAMPLE_PLACEHOLDER, few_shot_examples
+                    )
 
-
-
-            }
+            result = agent.invoke(messages, seed=self.seed)
+
+            return {mappers.EVALUATED_LLM_TASK_OUTPUT: result}
 
         return llm_task
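
To make the changed call surface concrete: in 1.0.1 the constructor no longer takes `project_name`, and `optimize_prompt` gains an optional `agent_class` (an `OptimizableAgent` subclass; when omitted the optimizer builds one via `create_litellm_agent_class(prompt)`). The sketch below is illustrative only and is inferred from the signatures visible in this diff; the `ChatPrompt(...)` keyword arguments, the model string, the dataset name, and the metric body are assumptions, not part of the diff.

```python
# Minimal usage sketch for opik-optimizer 1.0.1, inferred from this diff.
# Assumptions are marked inline; this is not the package's documented example.
import opik
from opik_optimizer.few_shot_bayesian_optimizer.few_shot_bayesian_optimizer import (
    FewShotBayesianOptimizer,
)
from opik_optimizer.optimization_config import chat_prompt


def exact_match(dataset_item: dict, llm_output: str) -> float:
    # optimize_prompt() requires a callable taking `dataset_item` and `llm_output`.
    return float(dataset_item.get("answer", "").strip() == llm_output.strip())


prompt = chat_prompt.ChatPrompt(
    messages=[  # assumed constructor argument; see chat_prompt.py in this release
        {"role": "system", "content": "Answer the question concisely."},
        {"role": "user", "content": "{question}"},
    ]
)

optimizer = FewShotBayesianOptimizer(
    model="openai/gpt-4o-mini",  # evaluated through litellm, per the imports above
    min_examples=2,
    max_examples=8,
    seed=42,
    n_threads=8,
    verbose=1,
)

dataset = opik.Opik().get_dataset("my-dataset")  # assumed dataset name

# New in 1.0.1: `agent_class` is optional; `model`/`model_kwargs` are copied onto
# the prompt when unset, and the baseline is computed before Optuna trials run.
result = optimizer.optimize_prompt(
    prompt=prompt,
    dataset=dataset,
    metric=exact_match,
    n_trials=10,
    n_samples=50,
)
print(result.score, result.prompt)
```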