opik_optimizer-0.7.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opik_optimizer/__init__.py +65 -0
- opik_optimizer/_throttle.py +43 -0
- opik_optimizer/base_optimizer.py +240 -0
- opik_optimizer/cache_config.py +24 -0
- opik_optimizer/demo/__init__.py +7 -0
- opik_optimizer/demo/cache.py +112 -0
- opik_optimizer/demo/datasets.py +656 -0
- opik_optimizer/few_shot_bayesian_optimizer/__init__.py +5 -0
- opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +408 -0
- opik_optimizer/few_shot_bayesian_optimizer/prompt_parameter.py +91 -0
- opik_optimizer/few_shot_bayesian_optimizer/prompt_templates.py +80 -0
- opik_optimizer/integrations/__init__.py +0 -0
- opik_optimizer/logging_config.py +69 -0
- opik_optimizer/meta_prompt_optimizer.py +1100 -0
- opik_optimizer/mipro_optimizer/__init__.py +1 -0
- opik_optimizer/mipro_optimizer/_lm.py +394 -0
- opik_optimizer/mipro_optimizer/_mipro_optimizer_v2.py +1058 -0
- opik_optimizer/mipro_optimizer/mipro_optimizer.py +395 -0
- opik_optimizer/mipro_optimizer/utils.py +107 -0
- opik_optimizer/optimization_config/__init__.py +0 -0
- opik_optimizer/optimization_config/configs.py +35 -0
- opik_optimizer/optimization_config/mappers.py +49 -0
- opik_optimizer/optimization_result.py +211 -0
- opik_optimizer/task_evaluator.py +102 -0
- opik_optimizer/utils.py +132 -0
- opik_optimizer-0.7.0.dist-info/METADATA +35 -0
- opik_optimizer-0.7.0.dist-info/RECORD +30 -0
- opik_optimizer-0.7.0.dist-info/WHEEL +5 -0
- opik_optimizer-0.7.0.dist-info/licenses/LICENSE +21 -0
- opik_optimizer-0.7.0.dist-info/top_level.txt +1 -0
opik_optimizer/mipro_optimizer/_mipro_optimizer_v2.py
@@ -0,0 +1,1058 @@
import logging
import random
import textwrap
from collections import defaultdict
from typing import Any, Callable, Dict, List, Literal, Optional, Tuple

import numpy as np
import opik
import optuna
from optuna.distributions import CategoricalDistribution

import dspy
from dspy.evaluate.evaluate import Evaluate
from dspy.propose import GroundedProposer
from dspy.teleprompt.teleprompt import Teleprompter
from dspy.teleprompt.utils import (
    create_minibatch,
    create_n_fewshot_demo_sets,
    eval_candidate_program,
    get_program_with_highest_avg_score,
    get_signature,
    print_full_program,
    save_candidate_program,
    set_signature,
)


class Logger():
    def info(self, *args, **kwargs):
        print(*args)

logger = Logger()

# Constants
BOOTSTRAPPED_FEWSHOT_EXAMPLES_IN_CONTEXT = 3
LABELED_FEWSHOT_EXAMPLES_IN_CONTEXT = 0
MIN_MINIBATCH_SIZE = 50

AUTO_RUN_SETTINGS = {
    "light": {"num_trials": 7, "val_size": 100},
    "medium": {"num_trials": 25, "val_size": 300},
    "heavy": {"num_trials": 50, "val_size": 1000},
}

# ANSI escape codes for colors
YELLOW = "\033[93m"
GREEN = "\033[92m"
BLUE = "\033[94m"
BOLD = "\033[1m"
ENDC = "\033[0m"  # Resets the color to default

import opik
from opik_optimizer import task_evaluator
from opik_optimizer.optimization_config.configs import MetricConfig, TaskConfig
from opik_optimizer.optimization_config import mappers


class MIPROv2(Teleprompter):
    def __init__(
        self,
        metric: Callable,
        prompt_model: Optional[Any] = None,
        task_model: Optional[Any] = None,
        teacher_settings: Dict = {},
        max_bootstrapped_demos: int = 4,
        max_labeled_demos: int = 4,
        auto: Optional[Literal["light", "medium", "heavy"]] = "medium",
        num_candidates: int = 10,
        num_threads: Optional[int] = None,
        max_errors: int = 10,
        seed: int = 9,
        init_temperature: float = 0.5,
        verbose: bool = False,
        track_stats: bool = True,
        log_dir: Optional[str] = None,
        metric_threshold: Optional[float] = None,
        opik_dataset: Optional[opik.Dataset] = None,
        opik_metric_config: Optional[MetricConfig] = None,
        opik_prompt_task_config: Optional[TaskConfig] = None,
        opik_project_name: Optional[str] = None,
        opik_optimization_id: Optional[str] = None,
        experiment_config: Optional[Dict[str, Any]] = None,
    ):
        # Validate 'auto' parameter
        allowed_modes = {None, "light", "medium", "heavy"}
        if auto not in allowed_modes:
            raise ValueError(
                f"Invalid value for auto: {auto}. Must be one of {allowed_modes}."
            )
        self.auto = auto

        self.num_candidates = num_candidates
        self.metric = metric
        self.init_temperature = init_temperature
        self.task_model = task_model if task_model else dspy.settings.lm
        self.prompt_model = prompt_model if prompt_model else dspy.settings.lm
        self.max_bootstrapped_demos = max_bootstrapped_demos
        self.max_labeled_demos = max_labeled_demos
        self.verbose = verbose
        self.track_stats = track_stats
        self.log_dir = log_dir
        self.teacher_settings = teacher_settings
        self.prompt_model_total_calls = 0
        self.total_calls = 0
        self.num_threads = num_threads
        self.max_errors = max_errors
        self.metric_threshold = metric_threshold
        self.seed = seed
        self.rng = None

        self.opik_dataset = opik_dataset
        self.opik_metric_config = opik_metric_config
        self.opik_prompt_task_config = opik_prompt_task_config
        self.opik_project_name = opik_project_name
        self.opik_optimization_id = opik_optimization_id
        self.experiment_config = experiment_config or {}

    def compile(
        self,
        student: Any,
        *,
        trainset: List,
        teacher: Any = None,
        valset: Optional[List] = None,
        num_trials: int = 30,
        max_bootstrapped_demos: Optional[int] = None,
        max_labeled_demos: Optional[int] = None,
        seed: Optional[int] = None,
        minibatch: bool = True,
        minibatch_size: int = 35,
        minibatch_full_eval_steps: int = 5,
        program_aware_proposer: bool = True,
        data_aware_proposer: bool = True,
        view_data_batch_size: int = 10,
        tip_aware_proposer: bool = True,
        fewshot_aware_proposer: bool = True,
        requires_permission_to_run: bool = True,
        provide_traceback: Optional[bool] = None,
    ) -> Any:
        # Set random seeds
        seed = seed or self.seed
        self._set_random_seeds(seed)

        # Update max demos if specified
        if max_bootstrapped_demos is not None:
            self.max_bootstrapped_demos = max_bootstrapped_demos
        if max_labeled_demos is not None:
            self.max_labeled_demos = max_labeled_demos

        # Set training & validation sets
        trainset, valset = self._set_and_validate_datasets(trainset, valset)

        # Set hyperparameters based on run mode (if set)
        zeroshot_opt = (self.max_bootstrapped_demos == 0) and (
            self.max_labeled_demos == 0
        )
        num_trials, valset, minibatch = self._set_hyperparams_from_run_mode(
            student, num_trials, minibatch, zeroshot_opt, valset
        )

        if self.auto:
            self._print_auto_run_settings(num_trials, minibatch, valset)

        if minibatch and minibatch_size > len(valset):
            raise ValueError(
                f"Minibatch size cannot exceed the size of the valset. Valset size: {len(valset)}."
            )

        # Estimate LM calls and get user confirmation
        if requires_permission_to_run:
            if not self._get_user_confirmation(
                student,
                num_trials,
                minibatch,
                minibatch_size,
                minibatch_full_eval_steps,
                valset,
                program_aware_proposer,
            ):
                logger.info("Compilation aborted by the user.")
                return student  # Return the original student program

        # Initialize program and evaluator
        program = student.deepcopy()
        evaluate = Evaluate(
            devset=valset,
            metric=self.metric,
            num_threads=self.num_threads,
            max_errors=self.max_errors,
            display_table=False,
            display_progress=True,
            provide_traceback=provide_traceback,
        )

        # Step 1: Bootstrap few-shot examples
        demo_candidates = self._bootstrap_fewshot_examples(
            program, trainset, seed, teacher
        )

        # Step 2: Propose instruction candidates
        try:
            instruction_candidates = self._propose_instructions(
                program,
                trainset,
                demo_candidates,
                view_data_batch_size,
                program_aware_proposer,
                data_aware_proposer,
                tip_aware_proposer,
                fewshot_aware_proposer,
            )
        except RuntimeError:
            raise Exception("Make sure you have provider API key set") from None

        # If zero-shot, discard demos
        if zeroshot_opt:
            demo_candidates = None

        # Step 3: Find optimal prompt parameters
        best_program = self._optimize_prompt_parameters(
            program,
            instruction_candidates,
            demo_candidates,
            evaluate,
            valset,
            num_trials,
            minibatch,
            minibatch_size,
            minibatch_full_eval_steps,
            seed,
        )

        return best_program

    def _set_random_seeds(self, seed):
        self.rng = random.Random(seed)
        np.random.seed(seed)

    def _set_hyperparams_from_run_mode(
        self,
        program: Any,
        num_trials: int,
        minibatch: bool,
        zeroshot_opt: bool,
        valset: List,
    ) -> Tuple[int, List, bool]:
        if self.auto is None:
            return num_trials, valset, minibatch

        num_vars = len(program.predictors())
        if not zeroshot_opt:
            num_vars *= 2  # Account for few-shot examples + instruction variables

        auto_settings = AUTO_RUN_SETTINGS[self.auto]
        num_trials = auto_settings["num_trials"]
        valset = create_minibatch(
            valset, batch_size=auto_settings["val_size"], rng=self.rng
        )
        minibatch = len(valset) > MIN_MINIBATCH_SIZE
        self.num_candidates = int(
            np.round(np.min([num_trials * num_vars, (1.5 * num_trials) / num_vars]))
        )

        return num_trials, valset, minibatch

    def _set_and_validate_datasets(self, trainset: List, valset: Optional[List]):
        if not trainset:
            raise ValueError("Trainset cannot be empty.")

        if valset is None:
            if len(trainset) < 2:
                raise ValueError(
                    "Trainset must have at least 2 examples if no valset specified."
                )
            valset_size = min(1000, max(1, int(len(trainset) * 0.80)))
            cutoff = len(trainset) - valset_size
            valset = trainset[cutoff:]
            trainset = trainset[:cutoff]
        else:
            if len(valset) < 1:
                raise ValueError("Validation set must have at least 1 example.")

        return trainset, valset

    def _print_auto_run_settings(self, num_trials: int, minibatch: bool, valset: List):
        logger.info(
            f"\nRUNNING WITH THE FOLLOWING {self.auto.upper()} AUTO RUN SETTINGS:"
            f"\nnum_trials: {num_trials}"
            f"\nminibatch: {minibatch}"
            f"\nnum_candidates: {self.num_candidates}"
            f"\nvalset size: {len(valset)}\n"
        )

    def _estimate_lm_calls(
        self,
        program: Any,
        num_trials: int,
        minibatch: bool,
        minibatch_size: int,
        minibatch_full_eval_steps: int,
        valset: List,
        program_aware_proposer: bool,
    ) -> Tuple[str, str]:
        num_predictors = len(program.predictors())

        # Estimate prompt model calls
        estimated_prompt_model_calls = (
            10  # Data summarizer calls
            + self.num_candidates * num_predictors  # Candidate generation
            + (
                num_predictors + 1 if program_aware_proposer else 0
            )  # Program-aware proposer
        )
        prompt_model_line = (
            f"{YELLOW}- Prompt Generation: {BLUE}{BOLD}10{ENDC}{YELLOW} data summarizer calls + "
            f"{BLUE}{BOLD}{self.num_candidates}{ENDC}{YELLOW} * "
            f"{BLUE}{BOLD}{num_predictors}{ENDC}{YELLOW} lm calls in program "
            f"+ ({BLUE}{BOLD}{num_predictors + 1}{ENDC}{YELLOW}) lm calls in program-aware proposer "
            f"= {BLUE}{BOLD}{estimated_prompt_model_calls}{ENDC}{YELLOW} prompt model calls{ENDC}"
        )

        # Estimate task model calls
        if not minibatch:
            estimated_task_model_calls = len(valset) * num_trials
            task_model_line = (
                f"{YELLOW}- Program Evaluation: {BLUE}{BOLD}{len(valset)}{ENDC}{YELLOW} examples in val set * "
                f"{BLUE}{BOLD}{num_trials}{ENDC}{YELLOW} batches = "
                f"{BLUE}{BOLD}{estimated_task_model_calls}{ENDC}{YELLOW} LM program calls{ENDC}"
            )
        else:
            full_eval_steps = num_trials // minibatch_full_eval_steps + 1
            estimated_task_model_calls = (
                minibatch_size * num_trials + len(valset) * full_eval_steps
            )
            task_model_line = (
                f"{YELLOW}- Program Evaluation: {BLUE}{BOLD}{minibatch_size}{ENDC}{YELLOW} examples in minibatch * "
                f"{BLUE}{BOLD}{num_trials}{ENDC}{YELLOW} batches + "
                f"{BLUE}{BOLD}{len(valset)}{ENDC}{YELLOW} examples in val set * "
                f"{BLUE}{BOLD}{full_eval_steps}{ENDC}{YELLOW} full evals = "
                f"{BLUE}{BOLD}{estimated_task_model_calls}{ENDC}{YELLOW} LM Program calls{ENDC}"
            )

        return prompt_model_line, task_model_line

    def _get_user_confirmation(
        self,
        program: Any,
        num_trials: int,
        minibatch: bool,
        minibatch_size: int,
        minibatch_full_eval_steps: int,
        valset: List,
        program_aware_proposer: bool,
    ) -> bool:
        prompt_model_line, task_model_line = self._estimate_lm_calls(
            program,
            num_trials,
            minibatch,
            minibatch_size,
            minibatch_full_eval_steps,
            valset,
            program_aware_proposer,
        )

        user_message = textwrap.dedent(
            f"""\
            {YELLOW}{BOLD}Projected Language Model (LM) Calls{ENDC}

            Based on the parameters you have set, the maximum number of LM calls is projected as follows:

            {prompt_model_line}
            {task_model_line}

            {YELLOW}{BOLD}Estimated Cost Calculation:{ENDC}

            {YELLOW}Total Cost = (Number of calls to task model * (Avg Input Token Length per Call * Task Model Price per Input Token + Avg Output Token Length per Call * Task Model Price per Output Token)
            + (Number of program calls * (Avg Input Token Length per Call * Task Prompt Price per Input Token + Avg Output Token Length per Call * Prompt Model Price per Output Token).{ENDC}

            For a preliminary estimate of potential costs, we recommend you perform your own calculations based on the task
            and prompt models you intend to use. If the projected costs exceed your budget or expectations, you may consider:

            {YELLOW}- Reducing the number of trials (`num_trials`), the size of the valset, or the number of LM calls in your program.{ENDC}
            {YELLOW}- Using a cheaper task model to optimize the prompt.{ENDC}
            {YELLOW}- Setting `minibatch=True` if you haven't already.{ENDC}\n"""
        )

        user_confirmation_message = textwrap.dedent(
            f"""\
            To proceed with the execution of this program, please confirm by typing {BLUE}'y'{ENDC} for yes or {BLUE}'n'{ENDC} for no.

            If you would like to bypass this confirmation step in future executions, set the {YELLOW}`requires_permission_to_run`{ENDC} flag to {YELLOW}`False`{ENDC} when calling compile.

            {YELLOW}Awaiting your input...{ENDC}
            """
        )

        user_input = (
            input(
                f"{user_message}\n{user_confirmation_message}\nDo you wish to continue? (y/n): "
            )
            .strip()
            .lower()
        )
        return user_input == "y"

    def _bootstrap_fewshot_examples(
        self, program: Any, trainset: List, seed: int, teacher: Any
    ) -> Optional[List]:
        logger.info("\n==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==")
        if self.max_bootstrapped_demos > 0:
            logger.info(
                "These will be used as few-shot example candidates for our program and for creating instructions.\n"
            )
        else:
            logger.info("These will be used for informing instruction proposal.\n")

        logger.info(f"Bootstrapping N={self.num_candidates} sets of demonstrations...")

        zeroshot = self.max_bootstrapped_demos == 0 and self.max_labeled_demos == 0

        try:
            demo_candidates = create_n_fewshot_demo_sets(
                student=program,
                num_candidate_sets=self.num_candidates,
                trainset=trainset,
                max_labeled_demos=(
                    LABELED_FEWSHOT_EXAMPLES_IN_CONTEXT
                    if zeroshot
                    else self.max_labeled_demos
                ),
                max_bootstrapped_demos=(
                    BOOTSTRAPPED_FEWSHOT_EXAMPLES_IN_CONTEXT
                    if zeroshot
                    else self.max_bootstrapped_demos
                ),
                metric=self.metric,
                max_errors=self.max_errors,
                teacher=teacher,
                teacher_settings=self.teacher_settings,
                seed=seed,
                metric_threshold=self.metric_threshold,
                rng=self.rng,
            )
        except Exception as e:
            logger.info(f"Error generating few-shot examples: {e}")
            logger.info("Running without few-shot examples.")
            demo_candidates = None

        return demo_candidates

    def _propose_instructions(
        self,
        program: Any,
        trainset: List,
        demo_candidates: Optional[List],
        view_data_batch_size: int,
        program_aware_proposer: bool,
        data_aware_proposer: bool,
        tip_aware_proposer: bool,
        fewshot_aware_proposer: bool,
    ) -> Dict[int, List[str]]:
        logger.info("\n==> STEP 2: PROPOSE INSTRUCTION CANDIDATES <==")
        logger.info(
            "We will use the few-shot examples from the previous step, a generated dataset summary, a summary of the program code, and a randomly selected prompting tip to propose instructions."
        )

        proposer = GroundedProposer(
            program=program,
            trainset=trainset,
            prompt_model=self.prompt_model,
            view_data_batch_size=view_data_batch_size,
            program_aware=program_aware_proposer,
            use_dataset_summary=data_aware_proposer,
            use_task_demos=fewshot_aware_proposer,
            num_demos_in_context=BOOTSTRAPPED_FEWSHOT_EXAMPLES_IN_CONTEXT,
            use_tip=tip_aware_proposer,
            set_tip_randomly=tip_aware_proposer,
            use_instruct_history=False,
            set_history_randomly=False,
            verbose=self.verbose,
            rng=self.rng,
        )

        logger.info("\nProposing instructions...\n")
        instruction_candidates = proposer.propose_instructions_for_program(
            trainset=trainset,
            program=program,
            demo_candidates=demo_candidates,
            N=self.num_candidates,
            T=self.init_temperature,
            trial_logs={},
        )

        for i, pred in enumerate(program.predictors()):
            logger.info(f"Proposed Instructions for Predictor {i}:\n")
            instruction_candidates[i][0] = get_signature(pred).instructions
            for j, instruction in enumerate(instruction_candidates[i]):
                logger.info(f"{j}: {instruction}\n")
            logger.info("\n")

        return instruction_candidates

    def _optimize_prompt_parameters(
        self,
        program: Any,
        instruction_candidates: Dict[int, List[str]],
        demo_candidates: Optional[List],
        evaluate: Evaluate,
        valset: List,
        num_trials: int,
        minibatch: bool,
        minibatch_size: int,
        minibatch_full_eval_steps: int,
        seed: int,
    ) -> Optional[Any]:
        # Run optimization
        optuna.logging.set_verbosity(optuna.logging.WARNING)
        logger.info("==> STEP 3: FINDING OPTIMAL PROMPT PARAMETERS <==")
        logger.info(
            "We will evaluate the program over a series of trials with different combinations of instructions and few-shot examples to find the optimal combination using Bayesian Optimization.\n"
        )

        # Compute the adjusted total trials that we will run (including full evals)
        run_additional_full_eval_at_end = (
            1 if num_trials % minibatch_full_eval_steps != 0 else 0
        )
        adjusted_num_trials = (
            (
                num_trials
                + num_trials // minibatch_full_eval_steps
                + 1
                + run_additional_full_eval_at_end
            )
            if minibatch
            else num_trials
        )
        logger.info(
            f"== Trial {1} / {adjusted_num_trials} - Full Evaluation of Default Program =="
        )

        # default_score, _ = eval_candidate_program(
        #     len(valset), valset, program, evaluate, self.rng, return_all_scores=True
        # )

        examples = []
        for demo in demo_candidates.values():
            for l in demo:
                for example in l:
                    examples.append(example.toDict())
        prompt = program.signature.instructions
        experiment_config = {
            **self.experiment_config,
            **{
                "configuration": {
                    "prompt": prompt,
                    "examples": examples,
                },
                "evaluation": "initial",
            },
        }

        default_score = eval_candidate_program_with_opik(
            opik_dataset=self.opik_dataset,
            trainset=valset,
            candidate_program=program,
            metric_config=self.opik_metric_config,
            prompt_task_config=self.opik_prompt_task_config,
            project_name=self.opik_project_name,
            num_threads=self.num_threads,
            experiment_config=experiment_config,
            optimization_id=self.opik_optimization_id,
        )

        logger.info(f"Default program score: {default_score}\n")

        trial_logs = {}
        trial_logs[1] = {}
        trial_logs[1]["full_eval_program_path"] = save_candidate_program(
            program, self.log_dir, -1
        )
        trial_logs[1]["full_eval_score"] = default_score
        trial_logs[1]["total_eval_calls_so_far"] = len(valset)
        trial_logs[1]["full_eval_program"] = program.deepcopy()

        if default_score == 1.0:
            return self.early_stop(default_score, program)

        # Initialize optimization variables
        best_score = default_score
        best_program = program.deepcopy()
        total_eval_calls = len(valset)
        score_data = [
            {"score": best_score, "program": program.deepcopy(), "full_eval": True}
        ]
        param_score_dict = defaultdict(list)
        fully_evaled_param_combos = {}

        # Define the objective function
        def objective(trial):
            nonlocal program, best_program, best_score, trial_logs, total_eval_calls, score_data

            trial_num = trial.number + 1
            if minibatch:
                logger.info(
                    f"== Trial {trial_num} / {adjusted_num_trials} - Minibatch =="
                )
            else:
                logger.info(f"===== Trial {trial_num} / {num_trials} =====")

            trial_logs[trial_num] = {}

            # Create a new candidate program
            candidate_program = program.deepcopy()

            # Choose instructions and demos, insert them into the program
            chosen_params, raw_chosen_params = (
                self._select_and_insert_instructions_and_demos(
                    candidate_program,
                    instruction_candidates,
                    demo_candidates,
                    trial,
                    trial_logs,
                    trial_num,
                )
            )

            # Log assembled program
            if self.verbose:
                logger.info("Evaluating the following candidate program...\n")
                print_full_program(candidate_program)

            # Evaluate the candidate program (on minibatch if minibatch=True)
            batch_size = minibatch_size if minibatch else len(valset)
            score = eval_candidate_program(
                batch_size, valset, candidate_program, evaluate, self.rng
            )
            # score = eval_candidate_program_with_opik(
            #     opik_dataset=self.opik_dataset,
            #     trainset=valset,
            #     candidate_program=candidate_program,
            #     metric_config=self.opik_metric_config,
            #     prompt_task_config=self.opik_prompt_task_config,
            #     project_name=self.opik_project_name,
            #     experiment_config=experiment_config,
            # )
            total_eval_calls += batch_size

            # Update best score and program
            if not minibatch and score > best_score:
                best_score = score
                best_program = candidate_program.deepcopy()
                logger.info(f"{GREEN}Best full score so far!{ENDC} Score: {score}")

            # Log evaluation results
            score_data.append(
                {
                    "score": score,
                    "program": candidate_program,
                    "full_eval": batch_size >= len(valset),
                }
            )  # score, prog, full_eval
            if minibatch:
                self._log_minibatch_eval(
                    score,
                    best_score,
                    batch_size,
                    chosen_params,
                    score_data,
                    trial,
                    adjusted_num_trials,
                    trial_logs,
                    trial_num,
                    candidate_program,
                    total_eval_calls,
                )
            else:
                self._log_normal_eval(
                    score,
                    best_score,
                    chosen_params,
                    score_data,
                    trial,
                    num_trials,
                    trial_logs,
                    trial_num,
                    valset,
                    batch_size,
                    candidate_program,
                    total_eval_calls,
                )
            categorical_key = ",".join(map(str, chosen_params))
            param_score_dict[categorical_key].append(
                (score, candidate_program, raw_chosen_params),
            )

            # If minibatch, perform full evaluation at intervals (and at the very end)
            if minibatch and (
                (trial_num % (minibatch_full_eval_steps + 1) == 0)
                or (trial_num == (adjusted_num_trials - 1))
            ):
                best_score, best_program, total_eval_calls = (
                    self._perform_full_evaluation(
                        trial_num,
                        adjusted_num_trials,
                        param_score_dict,
                        fully_evaled_param_combos,
                        evaluate,
                        valset,
                        trial_logs,
                        total_eval_calls,
                        score_data,
                        best_score,
                        best_program,
                        study,
                        instruction_candidates,
                        demo_candidates,
                    )
                )

            return score

        sampler = optuna.samplers.TPESampler(seed=seed, multivariate=True)
        study = optuna.create_study(direction="maximize", sampler=sampler)

        default_params = {
            f"{i}_predictor_instruction": 0 for i in range(len(program.predictors()))
        }
        if demo_candidates:
            default_params.update(
                {f"{i}_predictor_demos": 0 for i in range(len(program.predictors()))}
            )

        # Add default run as a baseline in optuna (TODO: figure out how to weight this by # of samples evaluated on)
        trial = optuna.trial.create_trial(
            params=default_params,
            distributions=self._get_param_distributions(
                program, instruction_candidates, demo_candidates
            ),
            value=default_score,
        )
        study.add_trial(trial)
        study.optimize(objective, n_trials=num_trials)

        # Attach logs to best program
        if best_program is not None and self.track_stats:
            best_program.trial_logs = trial_logs
            best_program.score = best_score
            best_program.prompt_model_total_calls = self.prompt_model_total_calls
            best_program.total_calls = self.total_calls
            sorted_candidate_programs = sorted(
                score_data, key=lambda x: x["score"], reverse=True
            )
            # Attach all minibatch programs
            best_program.mb_candidate_programs = [
                score_data
                for score_data in sorted_candidate_programs
                if not score_data["full_eval"]
            ]
            # Attach all programs that were evaluated on the full trainset, in descending order of score
            best_program.candidate_programs = [
                score_data
                for score_data in sorted_candidate_programs
                if score_data["full_eval"]
            ]

        logger.info(f"Returning best identified program with score {best_score}!")

        return best_program

    def _log_minibatch_eval(
        self,
        score,
        best_score,
        batch_size,
        chosen_params,
        score_data,
        trial,
        adjusted_num_trials,
        trial_logs,
        trial_num,
        candidate_program,
        total_eval_calls,
    ):
        trial_logs[trial_num]["mb_program_path"] = save_candidate_program(
            candidate_program, self.log_dir, trial_num
        )
        trial_logs[trial_num]["mb_score"] = score
        trial_logs[trial_num]["total_eval_calls_so_far"] = total_eval_calls
        trial_logs[trial_num]["mb_program"] = candidate_program.deepcopy()

        logger.info(
            f"Score: {score} on minibatch of size {batch_size} with parameters {chosen_params}."
        )
        minibatch_scores = ", ".join(
            [f"{s['score']}" for s in score_data if not s["full_eval"]]
        )
        logger.info(f"Minibatch scores so far: {'[' + minibatch_scores + ']'}")
        full_eval_scores = ", ".join(
            [f"{s['score']}" for s in score_data if s["full_eval"]]
        )
        trajectory = "[" + full_eval_scores + "]"
        logger.info(f"Full eval scores so far: {trajectory}")
        logger.info(f"Best full score so far: {best_score}")
        logger.info(
            f"{'=' * len(f'== Trial {trial.number + 1} / {adjusted_num_trials} - Minibatch Evaluation ==')}\n\n"
        )

    def _log_normal_eval(
        self,
        score,
        best_score,
        chosen_params,
        score_data,
        trial,
        num_trials,
        trial_logs,
        trial_num,
        valset,
        batch_size,
        candidate_program,
        total_eval_calls,
    ):
        trial_logs[trial_num]["full_eval_program_path"] = save_candidate_program(
            candidate_program, self.log_dir, trial_num
        )
        trial_logs[trial_num]["full_eval_score"] = score
        trial_logs[trial_num]["total_eval_calls_so_far"] = total_eval_calls
        trial_logs[trial_num]["full_eval_program"] = candidate_program.deepcopy()

        logger.info(f"Score: {score} with parameters {chosen_params}.")
        full_eval_scores = ", ".join(
            [f"{s['score']}" for s in score_data if s["full_eval"]]
        )
        logger.info(f"Scores so far: {'[' + full_eval_scores + ']'}")
        logger.info(f"Best score so far: {best_score}")
        logger.info(
            f"{'=' * len(f'===== Trial {trial.number + 1} / {num_trials} =====')}\n\n"
        )

    def _select_and_insert_instructions_and_demos(
        self,
        candidate_program: Any,
        instruction_candidates: Dict[int, List[str]],
        demo_candidates: Optional[List],
        trial: optuna.trial.Trial,
        trial_logs: Dict,
        trial_num: int,
    ) -> List[str]:
        chosen_params = []
        raw_chosen_params = {}

        for i, predictor in enumerate(candidate_program.predictors()):
            # Select instruction
            instruction_idx = trial.suggest_categorical(
                f"{i}_predictor_instruction", range(len(instruction_candidates[i]))
            )
            selected_instruction = instruction_candidates[i][instruction_idx]
            updated_signature = get_signature(predictor).with_instructions(
                selected_instruction
            )
            set_signature(predictor, updated_signature)
            trial_logs[trial_num][f"{i}_predictor_instruction"] = instruction_idx
            chosen_params.append(f"Predictor {i}: Instruction {instruction_idx}")
            raw_chosen_params[f"{i}_predictor_instruction"] = instruction_idx
            # Select demos if available
            if demo_candidates:
                demos_idx = trial.suggest_categorical(
                    f"{i}_predictor_demos", range(len(demo_candidates[i]))
                )
                predictor.demos = demo_candidates[i][demos_idx]
                trial_logs[trial_num][f"{i}_predictor_demos"] = demos_idx
                chosen_params.append(f"Predictor {i}: Few-Shot Set {demos_idx}")
                raw_chosen_params[f"{i}_predictor_demos"] = instruction_idx

        return chosen_params, raw_chosen_params

    def _get_param_distributions(
        self, program, instruction_candidates, demo_candidates
    ):
        param_distributions = {}

        for i in range(len(instruction_candidates)):
            param_distributions[f"{i}_predictor_instruction"] = CategoricalDistribution(
                range(len(instruction_candidates[i]))
            )
            if demo_candidates:
                param_distributions[f"{i}_predictor_demos"] = CategoricalDistribution(
                    range(len(demo_candidates[i]))
                )

        return param_distributions

    def _perform_full_evaluation(
        self,
        trial_num: int,
        adjusted_num_trials: int,
        param_score_dict: Dict,
        fully_evaled_param_combos: Dict,
        evaluate: Evaluate,
        valset: List,
        trial_logs: Dict,
        total_eval_calls: int,
        score_data,
        best_score: float,
        best_program: Any,
        study: optuna.Study,
        instruction_candidates: List,
        demo_candidates: List,
    ):
        logger.info(
            f"===== Trial {trial_num + 1} / {adjusted_num_trials} - Full Evaluation ====="
        )

        # Identify best program to evaluate fully
        highest_mean_program, mean_score, combo_key, params = (
            get_program_with_highest_avg_score(
                param_score_dict, fully_evaled_param_combos
            )
        )
        logger.info(
            f"Doing full eval on next top averaging program (Avg Score: {mean_score}) from minibatch trials..."
        )
        # full_eval_score_orig = eval_candidate_program(
        #     len(valset), valset, highest_mean_program, evaluate, self.rng
        # )

        examples = []
        for demo in demo_candidates.values():
            for l in demo:
                for example in l:
                    examples.append(example.toDict())
        prompt = highest_mean_program.signature.instructions
        experiment_config = {
            **self.experiment_config,
            **{
                "configuration": {
                    "prompt": prompt,
                    "examples": examples,
                },
                "evaluation": "full",
            },
        }

        full_eval_score = eval_candidate_program_with_opik(
            opik_dataset=self.opik_dataset,
            trainset=valset,
            candidate_program=highest_mean_program,
            metric_config=self.opik_metric_config,
            prompt_task_config=self.opik_prompt_task_config,
            project_name=self.opik_project_name,
            num_threads=self.num_threads,
            experiment_config=experiment_config,
            optimization_id=self.opik_optimization_id,
        )
        score_data.append(
            {
                "score": full_eval_score,
                "program": highest_mean_program,
                "full_eval": True,
            }
        )

        # Log full eval as a trial so that optuna can learn from the new results
        trial = optuna.trial.create_trial(
            params=params,
            distributions=self._get_param_distributions(
                best_program, instruction_candidates, demo_candidates
            ),
            value=full_eval_score,
        )
        study.add_trial(trial)

        # Log full evaluation results
        fully_evaled_param_combos[combo_key] = {
            "program": highest_mean_program,
            "score": full_eval_score,
        }
        total_eval_calls += len(valset)
        trial_logs[trial_num + 1] = {}
        trial_logs[trial_num + 1]["total_eval_calls_so_far"] = total_eval_calls
        trial_logs[trial_num + 1]["full_eval_program_path"] = save_candidate_program(
            program=highest_mean_program,
            log_dir=self.log_dir,
            trial_num=trial_num + 1,
            note="full_eval",
        )
        trial_logs[trial_num + 1]["full_eval_program"] = highest_mean_program
        trial_logs[trial_num + 1]["full_eval_score"] = full_eval_score

        if full_eval_score == 1.0:
            return self.early_stop(default_score, program)

        # Update best score and program if necessary
        if full_eval_score > best_score:
            logger.info(
                f"{GREEN}New best full eval score!{ENDC} Score: {full_eval_score}"
            )
            best_score = full_eval_score
            best_program = highest_mean_program.deepcopy()
        full_eval_scores = ", ".join(
            [f"{s['score']}" for s in score_data if s["full_eval"]]
        )
        trajectory = "[" + full_eval_scores + "]"
        logger.info(f"Full eval scores so far: {trajectory}")
        logger.info(f"Best full score so far: {best_score}")
        logger.info(
            len(f"===== Full Eval {len(fully_evaled_param_combos) + 1} =====") * "="
        )
        logger.info("\n")

        return best_score, best_program, total_eval_calls

    def early_stop(self, score, program):
        program.score = score
        program.candidate_programs = [{"score": score, "program": program.deepcopy()}]
        return program


def eval_candidate_program_with_opik(
    opik_dataset: opik.Dataset,
    trainset: List,
    candidate_program: Any,
    project_name: str,
    metric_config: MetricConfig,
    prompt_task_config: TaskConfig,
    num_threads: int,
    experiment_config: Optional[Dict[str, Any]] = None,
    optimization_id: Optional[str] = None,
):
    """Evaluate a candidate program on the trainset, using the specified batch size."""
    dataset_item_ids = [example["id"] for example in trainset]

    def program_task(dataset_item: Dict[str, Any]) -> Dict[str, Any]:
        program_inputs = {
            input_key: dataset_item[input_key]
            for input_key in prompt_task_config.input_dataset_fields
        }
        prediction = candidate_program(**program_inputs)

        # Increment assert and suggest failures to program's attributes
        if hasattr(candidate_program, "_assert_failures"):
            candidate_program._assert_failures += dspy.settings.get("assert_failures")
        if hasattr(candidate_program, "_suggest_failures"):
            candidate_program._suggest_failures += dspy.settings.get("suggest_failures")

        return {mappers.from_llm_response_text(): prediction[prompt_task_config.output_dataset_field]}

    score = task_evaluator.evaluate(
        dataset=opik_dataset,
        evaluated_task=program_task,
        metric_config=metric_config,
        dataset_item_ids=dataset_item_ids,
        project_name=project_name,
        num_threads=num_threads,
        experiment_config=experiment_config,
        optimization_id=optimization_id,
    )

    return score
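
For orientation, a minimal usage sketch of the MIPROv2 teleprompter defined above. This is not part of the package diff: the program, dataset, metric, and config objects are placeholders assumed to be built elsewhere, and only the constructor and compile() parameters visible in this module are exercised.

# Hypothetical usage sketch (not from the package): my_program, train_examples,
# opik_dataset, metric_config, and task_config are placeholders built elsewhere.
optimizer = MIPROv2(
    metric=my_dspy_metric,                # DSPy-style metric callable (placeholder)
    auto="light",                         # maps to num_trials=7, val_size=100 via AUTO_RUN_SETTINGS
    num_threads=4,
    opik_dataset=opik_dataset,            # opik.Dataset scored through eval_candidate_program_with_opik
    opik_metric_config=metric_config,     # MetricConfig instance
    opik_prompt_task_config=task_config,  # TaskConfig exposing input_dataset_fields / output_dataset_field
    opik_project_name="my-project",
)
best_program = optimizer.compile(
    student=my_program,                   # any dspy.Module exposing predictors()
    trainset=train_examples,
    requires_permission_to_run=False,     # skip the interactive LM-cost confirmation prompt
)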