opik-optimizer 0.9.1__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opik_optimizer/__init__.py +7 -3
- opik_optimizer/_throttle.py +8 -8
- opik_optimizer/base_optimizer.py +98 -45
- opik_optimizer/cache_config.py +5 -3
- opik_optimizer/datasets/ai2_arc.py +15 -13
- opik_optimizer/datasets/cnn_dailymail.py +19 -15
- opik_optimizer/datasets/election_questions.py +10 -11
- opik_optimizer/datasets/gsm8k.py +16 -11
- opik_optimizer/datasets/halu_eval.py +6 -5
- opik_optimizer/datasets/hotpot_qa.py +17 -16
- opik_optimizer/datasets/medhallu.py +10 -7
- opik_optimizer/datasets/rag_hallucinations.py +11 -8
- opik_optimizer/datasets/ragbench.py +17 -9
- opik_optimizer/datasets/tiny_test.py +33 -37
- opik_optimizer/datasets/truthful_qa.py +18 -12
- opik_optimizer/demo/cache.py +6 -6
- opik_optimizer/demo/datasets.py +3 -7
- opik_optimizer/evolutionary_optimizer/__init__.py +3 -1
- opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +748 -437
- opik_optimizer/evolutionary_optimizer/reporting.py +155 -76
- opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +291 -181
- opik_optimizer/few_shot_bayesian_optimizer/reporting.py +79 -28
- opik_optimizer/logging_config.py +19 -15
- opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +234 -138
- opik_optimizer/meta_prompt_optimizer/reporting.py +121 -47
- opik_optimizer/mipro_optimizer/__init__.py +2 -0
- opik_optimizer/mipro_optimizer/_lm.py +41 -9
- opik_optimizer/mipro_optimizer/_mipro_optimizer_v2.py +37 -26
- opik_optimizer/mipro_optimizer/mipro_optimizer.py +135 -67
- opik_optimizer/mipro_optimizer/utils.py +5 -2
- opik_optimizer/optimizable_agent.py +179 -0
- opik_optimizer/optimization_config/chat_prompt.py +143 -73
- opik_optimizer/optimization_config/configs.py +4 -3
- opik_optimizer/optimization_config/mappers.py +18 -6
- opik_optimizer/optimization_result.py +28 -20
- opik_optimizer/py.typed +0 -0
- opik_optimizer/reporting_utils.py +96 -46
- opik_optimizer/task_evaluator.py +12 -14
- opik_optimizer/utils.py +122 -37
- {opik_optimizer-0.9.1.dist-info → opik_optimizer-1.0.0.dist-info}/METADATA +8 -8
- opik_optimizer-1.0.0.dist-info/RECORD +50 -0
- opik_optimizer-0.9.1.dist-info/RECORD +0 -48
- {opik_optimizer-0.9.1.dist-info → opik_optimizer-1.0.0.dist-info}/WHEEL +0 -0
- {opik_optimizer-0.9.1.dist-info → opik_optimizer-1.0.0.dist-info}/licenses/LICENSE +0 -0
- {opik_optimizer-0.9.1.dist-info → opik_optimizer-1.0.0.dist-info}/top_level.txt +0 -0
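For orientation, the sketch below shows how the 1.0.0 `FewShotBayesianOptimizer` API reflected in the diff that follows might be called: `optimize_prompt` now requires a `ChatPrompt`, an `opik.Dataset`, and a metric callable taking `dataset_item` and `llm_output`, and `project_name` is no longer a constructor argument. The dataset name, the `question`/`answer` field names, and the metric choice are illustrative assumptions, not part of the release.

```python
import opik
from opik.evaluation.metrics import LevenshteinRatio

from opik_optimizer import FewShotBayesianOptimizer
from opik_optimizer.optimization_config import chat_prompt

# Dataset name and the "question"/"answer" fields are placeholders for this sketch.
dataset = opik.Opik().get_dataset("my-qa-dataset")


def levenshtein_ratio(dataset_item, llm_output):
    # Metric contract validated by optimize_prompt in 1.0.0: a callable that takes
    # `dataset_item` and `llm_output`.
    return LevenshteinRatio().score(reference=dataset_item["answer"], output=llm_output)


prompt = chat_prompt.ChatPrompt(
    messages=[
        {"role": "system", "content": "Answer the question."},
        {"role": "user", "content": "{question}"},
    ]
)

# `project_name` is no longer accepted here (it is removed in the __init__ hunk below).
optimizer = FewShotBayesianOptimizer(model="openai/gpt-4o-mini", min_examples=2, max_examples=8)

result = optimizer.optimize_prompt(
    prompt=prompt,       # must be a ChatPrompt instance
    dataset=dataset,     # must be an opik.Dataset
    metric=levenshtein_ratio,
    n_trials=10,
    n_samples=50,
)
print(result.score, result.prompt)
```

The new `agent_class` hook introduced in 1.0.0 is sketched after the diff.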
```diff
--- a/opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py
+++ b/opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py
@@ -1,22 +1,24 @@
+from typing import Any, Callable, Dict, List, Optional, Tuple, Type
+
 import json
 import logging
 import random
 from datetime import datetime
-from typing import Any, Callable, Dict, List, Optional, Tuple

 import litellm
-import opik
 import optuna
 import optuna.samplers
+
+import opik
 from opik import Dataset
 from opik.evaluation.models.litellm import opik_monitor as opik_litellm_monitor
 from pydantic import BaseModel

 from opik_optimizer import base_optimizer
-from
-
+from ..utils import create_litellm_agent_class
+from ..optimization_config import chat_prompt, mappers
+from ..optimizable_agent import OptimizableAgent
 from .. import _throttle, optimization_result, task_evaluator, utils
-from ..optimization_config import chat_prompt
 from . import reporting

 _limiter = _throttle.get_rate_limiter_for_current_opik_installation()
@@ -52,37 +54,38 @@ Return your output as a JSON object with:
 Respond only with the JSON object. Do not include any explanation or extra text.
 """

+
 class FewShotPromptTemplate(BaseModel):
     message_list_with_placeholder: List[Dict[str, str]]
     example_template: str

+
 class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
     """
     The Few-Shot Bayesian Optimizer can be used to add few-shot examples to prompts. This algorithm
     employes a two stage pipeline:

-    1. We generate a few-shot prompt template that is inserted can be inserted into the prompt
+    1. We generate a few-shot prompt template that is inserted can be inserted into the prompt
       provided
    2. We use Bayesian Optimization to determine the best examples to include in the prompt.

    This algorithm is best used when you have a well defined task and would like to guide the LLM
    by providing some examples.
    """
+
     def __init__(
         self,
         model: str,
-        project_name: Optional[str] = "Optimization",
         min_examples: int = 2,
         max_examples: int = 8,
         seed: int = 42,
         n_threads: int = 8,
         verbose: int = 1,
-        **model_kwargs,
+        **model_kwargs: Any,
     ) -> None:
         """
         Args:
             model: The model to used to evaluate the prompt
-            project_name: Optional project name for tracking
             min_examples: Minimum number of examples to include
             max_examples: Maximum number of examples to include
             seed: Random seed for reproducibility
@@ -90,38 +93,55 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
             verbose: Controls internal logging/progress bars (0=off, 1=on).
             **model_kwargs: Additional model parameters
         """
-        super().__init__(model,
+        super().__init__(model, verbose, **model_kwargs)
         self.min_examples = min_examples
         self.max_examples = max_examples
         self.seed = seed
         self.n_threads = n_threads
-        self.verbose
-        if verbose == 0:
+        if self.verbose == 0:
             logger.setLevel(logging.WARNING)
-        elif verbose == 1:
+        elif self.verbose == 1:
             logger.setLevel(logging.INFO)
-        elif verbose == 2:
+        elif self.verbose == 2:
             logger.setLevel(logging.DEBUG)
-
+
         self._opik_client = opik.Opik()
         self.llm_call_counter = 0
         logger.debug(f"Initialized FewShotBayesianOptimizer with model: {model}")

     @_throttle.rate_limited(_limiter)
-    def _call_model(
+    def _call_model(
+        self,
+        model: str,
+        messages: List[Dict[str, str]],
+        seed: int,
+        model_kwargs: Dict[str, Any],
+    ) -> Dict[str, Any]:
+        """
+        Args:
+            model: The model to use for the call
+            messages: List of message dictionaries with 'role' and 'content' keys
+            seed: Random seed for reproducibility
+            model_kwargs: Additional model parameters
+
+        Returns:
+            Dict containing the model's response
+        """
         self.llm_call_counter += 1

         current_model_kwargs = self.model_kwargs.copy()
         current_model_kwargs.update(model_kwargs)

         filtered_call_kwargs = current_model_kwargs.copy()
-        filtered_call_kwargs.pop(
-        filtered_call_kwargs.pop(
-        filtered_call_kwargs.pop(
-        filtered_call_kwargs.pop(
-        filtered_call_kwargs.pop(
-
-        final_params_for_litellm =
+        filtered_call_kwargs.pop("n_trials", None)
+        filtered_call_kwargs.pop("n_samples", None)
+        filtered_call_kwargs.pop("n_iterations", None)
+        filtered_call_kwargs.pop("min_examples", None)
+        filtered_call_kwargs.pop("max_examples", None)
+
+        final_params_for_litellm = (
+            opik_litellm_monitor.try_add_opik_monitoring_to_params(filtered_call_kwargs)
+        )

         response = litellm.completion(
             model=self.model,
@@ -135,6 +155,16 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
     def _split_dataset(
         self, dataset: List[Dict[str, Any]], train_ratio: float
     ) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
+        """
+        Split the dataset into training and validation sets.
+
+        Args:
+            dataset: List of dataset items
+            train_ratio: Ratio of items to use for training
+
+        Returns:
+            Tuple of (train_set, validation_set)
+        """
         """Split the dataset into training and validation sets.

         Args:
@@ -158,47 +188,56 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
         self,
         model: str,
         prompt: chat_prompt.ChatPrompt,
-        few_shot_examples: List[Dict[str, Any]]
+        few_shot_examples: List[Dict[str, Any]],
     ) -> FewShotPromptTemplate:
+        """
+        Generate a few-shot prompt template that can be used to insert examples into the prompt.
+
+        Args:
+            model: The model to use for generating the template
+            prompt: The base prompt to modify
+            few_shot_examples: List of example pairs with input and output fields
+
+        Returns:
+            FewShotPromptTemplate containing the modified message list and example template
+        """
         """
         During this step we update the system prompt to include few-shot examples.
         """
         user_message = {
-            "message_list": prompt.
-            "examples": few_shot_examples
+            "message_list": prompt.get_messages(),
+            "examples": few_shot_examples,
         }
-
+
         messages: List[Dict[str, str]] = [
             {"role": "system", "content": SYSTEM_PROMPT_TEMPLATE},
             {"role": "user", "content": json.dumps(user_message)},
         ]
-
+
         logger.debug(f"fewshot_prompt_template - Calling LLM with: {messages}")
-        response = self._call_model(
-            model,
-            messages,
-            self.seed,
-            self.model_kwargs
-        )
+        response = self._call_model(model, messages, self.seed, self.model_kwargs)
         logger.debug(f"fewshot_prompt_template - LLM response: {response}")

         try:
             res = utils.json_to_dict(response["choices"][0]["message"]["content"])
             return FewShotPromptTemplate(
                 message_list_with_placeholder=res["message_list_with_placeholder"],
-                example_template=res["example_template"]
+                example_template=res["example_template"],
             )
         except Exception as e:
-            logger.error(
+            logger.error(
+                f"Failed to compute few-shot prompt template: {e} - response: {response}"
+            )
             raise

     def _run_optimization(
         self,
+        prompt: chat_prompt.ChatPrompt,
         fewshot_prompt_template: FewShotPromptTemplate,
         dataset: Dataset,
         metric: Callable,
+        baseline_score: float,
         n_trials: int = 10,
-        baseline_score: Optional[float] = None,
         optimization_id: Optional[str] = None,
         experiment_config: Optional[Dict] = None,
         n_samples: Optional[int] = None,
@@ -206,21 +245,22 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
         reporting.start_optimization_run(verbose=self.verbose)

         random.seed(self.seed)
-
-
+
         # Load the dataset
         dataset_items = dataset.get_items()
         all_dataset_item_ids = [item["id"] for item in dataset_items]
         eval_dataset_item_ids = all_dataset_item_ids
         if n_samples is not None and n_samples < len(dataset_items):
             eval_dataset_item_ids = random.sample(all_dataset_item_ids, n_samples)
-
+
         # Define the experiment configuration
         experiment_config = experiment_config or {}
         base_experiment_config = {  # Base config for reuse
             **experiment_config,
             **{
                 "optimizer": self.__class__.__name__,
+                "agent_class": self.agent_class.__name__,
+                "agent_config": prompt.to_dict(),
                 "metric": metric.__name__,
                 "dataset": dataset.name,
                 "configuration": {},
@@ -249,31 +289,44 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
                 for key, value in example.items():
                     processed_example[key] = str(value)

-
-
-
-
-
-
-
+                processed_demo_example = fewshot_prompt_template.example_template
+                for key, value in processed_example.items():
+                    try:
+                        processed_demo_example = processed_demo_example.replace(
+                            f"{{{key}}}", str(value)
+                        )
+                    except Exception:
+                        logger.error(
+                            f"Failed to format fewshot prompt template {fewshot_prompt_template} with example: {processed_example} "
+                        )
+                        raise
+                processed_demo_examples.append(processed_demo_example)
             few_shot_examples = "\n\n".join(processed_demo_examples)
-
+
             llm_task = self._build_task_from_messages(
+                prompt=prompt,
                 messages=fewshot_prompt_template.message_list_with_placeholder,
-                few_shot_examples=few_shot_examples
+                few_shot_examples=few_shot_examples,
             )

-            messages_for_reporting = [
-
-
-
+            messages_for_reporting = [
+                {
+                    "role": item["role"],
+                    "content": item["content"].replace(
+                        FEW_SHOT_EXAMPLE_PLACEHOLDER, few_shot_examples
+                    ),
+                }
+                for item in fewshot_prompt_template.message_list_with_placeholder
+            ]

             # Log trial config
             trial_config = base_experiment_config.copy()
-            trial_config["configuration"]["prompt"] =
-
-
-            ] =
+            trial_config["configuration"]["prompt"] = (
+                messages_for_reporting  # Base instruction
+            )
+            trial_config["configuration"]["examples"] = (
+                processed_demo_examples  # Log stringified examples
+            )
             trial_config["configuration"]["n_examples"] = n_examples
             trial_config["configuration"]["example_indices"] = example_indices

@@ -282,7 +335,9 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
             )
             logger.debug(f"Evaluating trial {trial.number}...")

-            with reporting.start_optimization_trial(
+            with reporting.start_optimization_trial(
+                trial.number, n_trials, verbose=self.verbose
+            ) as trial_reporter:
                 trial_reporter.start_trial(messages_for_reporting)
                 score = task_evaluator.evaluate(
                     dataset=dataset,
@@ -290,7 +345,7 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
                     metric=metric,
                     evaluated_task=llm_task,
                     num_threads=self.n_threads,
-                    project_name=self.project_name,
+                    project_name=self.agent_class.project_name,
                     experiment_config=trial_config,
                     optimization_id=optimization_id,
                     verbose=self.verbose,
@@ -302,7 +357,7 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
             trial_config = {
                 "demo_examples": demo_examples,
                 "message_list_with_placeholder": fewshot_prompt_template.message_list_with_placeholder,
-                "message_list": messages_for_reporting
+                "message_list": messages_for_reporting,
             }
             trial.set_user_attr("score", score)
             trial.set_user_attr("config", trial_config)
@@ -324,62 +379,89 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
         # Explicitly create and seed the sampler for Optuna
         sampler = optuna.samplers.TPESampler(seed=self.seed)
         study = optuna.create_study(direction="maximize", sampler=sampler)
-
+
         study.optimize(
-            optimization_objective,
-            n_trials=n_trials,
-            show_progress_bar=False
+            optimization_objective, n_trials=n_trials, show_progress_bar=False
         )
-
+
         optuna_history_processed = []
         for trial_idx, trial in enumerate(study.trials):
             if trial.state == optuna.trial.TrialState.COMPLETE:
                 trial_config = trial.user_attrs.get("config", {})
-                prompt_cand_display = trial_config.get(
-
-
+                prompt_cand_display = trial_config.get(
+                    "message_list"
+                )  # Default to None
+
+                score_val = (
+                    trial.value
+                )  # This can be None if trial failed to produce a score
                 duration_val = None
                 if trial.datetime_complete and trial.datetime_start:
-                    duration_val = (
+                    duration_val = (
+                        trial.datetime_complete - trial.datetime_start
+                    ).total_seconds()

                 iter_detail = {
-                    "iteration": trial.number + 1,
-                    "timestamp":
+                    "iteration": trial.number + 1,
+                    "timestamp": (
+                        trial.datetime_start.isoformat()
+                        if trial.datetime_start
+                        else datetime.now().isoformat()
+                    ),
                     "prompt_candidate": prompt_cand_display,
-                    "parameters_used": {
-                        "optuna_params": trial.user_attrs.get("config", {}),
-                        "example_indices": trial.user_attrs.get(
+                    "parameters_used": {
+                        "optuna_params": trial.user_attrs.get("config", {}),
+                        "example_indices": trial.user_attrs.get(
+                            "example_indices", []
+                        ),  # Default to empty list
                     },
-                    "scores": [
-
-
-
+                    "scores": [
+                        {
+                            "metric_name": metric.__name__,
+                            "score": score_val,  # Can be None
+                        }
+                    ],
                     "duration_seconds": duration_val,
                 }
                 optuna_history_processed.append(iter_detail)
             else:
-                logger.warning(
+                logger.warning(
+                    f"Skipping trial {trial.number} from history due to state: {trial.state}. Value: {trial.value}"
+                )

         best_trial = study.best_trial
         best_score = best_trial.value
         best_example_indices = best_trial.user_attrs.get("example_indices", [])

+        if best_score <= baseline_score:
+            best_score = baseline_score
+            best_prompt = prompt.get_messages()
+        else:
+            best_prompt = best_trial.user_attrs["config"]["message_list"]
+
         reporting.display_result(
             initial_score=baseline_score,
             best_score=best_score,
-            best_prompt=
-            verbose=self.verbose
+            best_prompt=best_prompt,
+            verbose=self.verbose,
         )

         return optimization_result.OptimizationResult(
             optimizer=self.__class__.__name__,
-            prompt=
+            prompt=best_prompt,
+            initial_prompt=prompt.get_messages(),
+            initial_score=baseline_score,
             score=best_score,
             metric_name=metric.__name__,
             details={
-                "
+                "initial_score": baseline_score,
+                "chat_messages": (
+                    best_trial.user_attrs["config"]["message_list"]
+                    if best_trial.user_attrs["config"]
+                    else []
+                ),
                 "prompt_parameter": best_trial.user_attrs["config"],
-                #"n_examples": best_n_examples,
+                # "n_examples": best_n_examples,
                 "example_indices": best_example_indices,
                 "trial_number": best_trial.number,
                 "total_trials": n_trials,
@@ -390,30 +472,54 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
                 "temperature": self.model_kwargs.get("temperature"),
             },
             history=optuna_history_processed,
-            llm_calls=self.llm_call_counter
+            llm_calls=self.llm_call_counter,
+            dataset_id=dataset.id,
+            optimization_id=optimization_id,
         )

-    def optimize_prompt(
+    def optimize_prompt(  # type: ignore
         self,
         prompt: chat_prompt.ChatPrompt,
         dataset: Dataset,
         metric: Callable,
         n_trials: int = 10,
+        agent_class: Optional[Type[OptimizableAgent]] = None,
         experiment_config: Optional[Dict] = None,
         n_samples: Optional[int] = None,
     ) -> optimization_result.OptimizationResult:
         """
         Args:
-            prompt:
+            prompt:
             dataset: Opik Dataset to optimize on
             metric: Metric function to evaluate on
             n_trials: Number of trials for Bayesian Optimization
             experiment_config: Optional configuration for the experiment, useful to log additional metadata
             n_samples: Optional number of items to test in the dataset
-
+
         Returns:
             OptimizationResult: Result of the optimization
         """
+        if not isinstance(prompt, chat_prompt.ChatPrompt):
+            raise ValueError("Prompt must be a ChatPrompt object")
+
+        if not isinstance(dataset, Dataset):
+            raise ValueError("Dataset must be a Dataset object")
+
+        if not callable(metric):
+            raise ValueError(
+                "Metric must be a function that takes `dataset_item` and `llm_output` as arguments."
+            )
+
+        if prompt.model is None:
+            prompt.model = self.model
+        if prompt.model_kwargs is None:
+            prompt.model_kwargs = self.model_kwargs
+
+        if agent_class is None:
+            self.agent_class = create_litellm_agent_class(prompt)
+        else:
+            self.agent_class = agent_class
+
         optimization = None
         try:
             optimization = self._opik_client.create_optimization(
@@ -421,87 +527,95 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
                 objective_name=metric.__name__,
                 metadata={"optimizer": self.__class__.__name__},
             )
+            optimization_run_id = optimization.id
         except Exception:
             logger.warning(
                 "Opik server does not support optimizations. Please upgrade opik."
             )
             optimization = None
+            optimization_run_id = None

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        with reporting.display_evaluation(message="First we will establish the baseline performance:", verbose=self.verbose) as eval_report:
-            baseline_score = self.evaluate_prompt(
-                prompt=prompt,
-                dataset=dataset,
-                metric=metric,
-                n_samples=n_samples,
-                optimization_id=optimization.id if optimization is not None else None
-            )
-
-            eval_report.set_score(baseline_score)
-
-        # Step 2. Create the few-shot prompt template
-        with reporting.creation_few_shot_prompt_template(verbose=self.verbose) as fewshot_template_report:
-            fewshot_template = self._create_fewshot_prompt_template(
-                model=self.model,
-                prompt=prompt,
-                few_shot_examples=[{k: v for k, v in item.items() if k != 'id'}
-                    for item in dataset.get_items(nb_samples=10)]
-            )
+        # Start experiment reporting
+        reporting.display_header(
+            algorithm=self.__class__.__name__,
+            optimization_id=optimization_run_id,
+            dataset_id=dataset.id,
+            verbose=self.verbose,
+        )
+        reporting.display_configuration(
+            prompt.get_messages(),
+            optimizer_config={
+                "optimizer": self.__class__.__name__,
+                "metric": metric.__name__,
+                "n_trials": n_trials,
+                "n_samples": n_samples,
+            },
+            verbose=self.verbose,
+        )

-
+        utils.disable_experiment_reporting()

-
-
-
+        # Step 1. Compute the baseline evaluation
+        with reporting.display_evaluation(
+            message="First we will establish the baseline performance:",
+            verbose=self.verbose,
+        ) as eval_report:
+            baseline_score = self._evaluate_prompt(
+                prompt,
                 dataset=dataset,
                 metric=metric,
-                optimization_id=optimization.id if optimization is not None else None,
-                experiment_config=experiment_config,
-                n_trials=n_trials,
-                baseline_score=baseline_score,
                 n_samples=n_samples,
+                optimization_id=(optimization.id if optimization is not None else None),
             )
-            if optimization:
-                self.update_optimization(optimization, status="completed")

-
-            return result
-        except Exception as e:
-            if optimization:
-                self.update_optimization(optimization, status="cancelled")
-            logger.error(f"FewShotBayesian optimization failed: {e}", exc_info=True)
-            utils.enable_experiment_reporting()
-            raise e
+            eval_report.set_score(baseline_score)

-
+        # Step 2. Create the few-shot prompt template
+        with reporting.creation_few_shot_prompt_template(
+            verbose=self.verbose
+        ) as fewshot_template_report:
+            fewshot_template = self._create_fewshot_prompt_template(
+                model=self.model,
+                prompt=prompt,
+                few_shot_examples=[
+                    {k: v for k, v in item.items() if k != "id"}
+                    for item in dataset.get_items(nb_samples=10)
+                ],
+            )
+
+            fewshot_template_report.set_fewshot_template(fewshot_template)
+
+        # Step 3. Start the optimization process
+        result = self._run_optimization(
+            prompt=prompt,
+            fewshot_prompt_template=fewshot_template,
+            dataset=dataset,
+            metric=metric,
+            baseline_score=baseline_score,
+            optimization_id=optimization.id if optimization is not None else None,
+            experiment_config=experiment_config,
+            n_trials=n_trials,
+            n_samples=n_samples,
+        )
+        if optimization:
+            self.update_optimization(optimization, status="completed")
+
+        utils.enable_experiment_reporting()
+        return result
+
+    def _evaluate_prompt(
         self,
         prompt: chat_prompt.ChatPrompt,
         dataset: opik.Dataset,
         metric: Callable,
+        n_samples: Optional[int] = None,
         dataset_item_ids: Optional[List[str]] = None,
         experiment_config: Optional[Dict] = None,
         optimization_id: Optional[str] = None,
-
+        **kwargs: Any,
     ) -> float:
         """
         Args:
-            prompt: The prompt to evaluate
             dataset: Opik Dataset to evaluate the prompt on
             metric: Metric function to evaluate on, should have the arguments `dataset_item` and `llm_output`
             dataset_item_ids: Optional list of dataset item IDs to evaluate
@@ -511,27 +625,19 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
         Returns:
             float: The evaluation score
         """
-
-        if not all(
-            isinstance(item, dict) and "role" in item and "content" in item
-            for item in prompt.formatted_messages
-        ):
-            raise ValueError(
-                "A ChatPrompt must be a list of dictionaries with 'role' and 'content' keys."
-            )
-
-        llm_task = self._build_task_from_messages(prompt.formatted_messages)
+        llm_task = self._build_task_from_messages(prompt, prompt.get_messages())

         experiment_config = experiment_config or {}
+        experiment_config["project_name"] = self.agent_class.__name__
         experiment_config = {
             **experiment_config,
             **{
                 "optimizer": self.__class__.__name__,
+                "agent_class": self.agent_class.__name__,
+                "agent_config": prompt.to_dict(),
                 "metric": metric.__name__,
                 "dataset": dataset.name,
-                "configuration": {
-                    "prompt": prompt.formatted_messages,
-                },
+                "configuration": {"prompt": prompt.get_messages()},
             },
         }

@@ -549,7 +655,7 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
             metric=metric,
             evaluated_task=llm_task,
             num_threads=self.n_threads,
-            project_name=self.project_name,
+            project_name=self.agent_class.project_name,
             experiment_config=experiment_config,
             optimization_id=optimization_id,
             verbose=self.verbose,
@@ -558,32 +664,36 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):

         return score

-
     def _build_task_from_messages(
-        self,
-
+        self,
+        prompt: chat_prompt.ChatPrompt,
+        messages: List[Dict[str, str]],
+        few_shot_examples: Optional[str] = None,
+    ) -> Callable[[Dict[str, Any]], Dict[str, Any]]:
+        new_prompt = prompt.copy()
+        new_prompt.set_messages(messages)
+        agent = self.agent_class(new_prompt)
+
         def llm_task(dataset_item: Dict[str, Any]) -> Dict[str, Any]:
-
-
-
-
-
+            """
+            Process a single dataset item through the LLM task.
+
+            Args:
+                dataset_item: Dictionary containing the dataset item data
+
+            Returns:
+                Dictionary containing the LLM's response
+            """
+            messages = new_prompt.get_messages(dataset_item)

             if few_shot_examples:
-
-                "
-
-
-
-                response = self._call_model(
-                    model=self.model,
-                    messages=prompt_,
-                    seed=self.seed,
-                    model_kwargs=self.model_kwargs
-                )
+                for message in messages:
+                    message["content"] = message["content"].replace(
+                        FEW_SHOT_EXAMPLE_PLACEHOLDER, few_shot_examples
+                    )

-
-
-            }
+            result = agent.invoke(messages, seed=self.seed)
+
+            return {mappers.EVALUATED_LLM_TASK_OUTPUT: result}

         return llm_task
```