opik-optimizer 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
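The module added in this release wraps DSPy's MIPROv2 teleprompter so that baseline and full evaluations of candidate programs are scored through Opik's task evaluator against an Opik dataset. For orientation, a minimal usage sketch follows; the dataset, metric config, task config, metric function, project name, and training examples are illustrative assumptions (constructed as described in the opik_optimizer documentation), not values taken from this package:

import dspy

# Assumed to already exist; their construction is documented in opik_optimizer, not shown in this diff.
dataset = ...        # opik.Dataset whose items carry an "id" field plus the task fields
metric_config = ...  # opik_optimizer MetricConfig
task_config = ...    # opik_optimizer TaskConfig (with input_dataset_fields / output_dataset_field set)

def exact_match(example, prediction, trace=None):  # hypothetical DSPy-style metric
    return example.answer.strip().lower() == prediction.answer.strip().lower()

program = dspy.Predict("question -> answer")
trainset = [...]  # dspy.Example objects mirroring the Opik dataset items

# MIPROv2 is the class defined in the diff below.
optimizer = MIPROv2(
    metric=exact_match,
    auto="light",
    opik_dataset=dataset,
    opik_metric_config=metric_config,
    opik_prompt_task_config=task_config,
    opik_project_name="mipro-demo",  # hypothetical project name
)
best_program = optimizer.compile(
    student=program,
    trainset=trainset,
    requires_permission_to_run=False,
)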
@@ -0,0 +1,1058 @@
+ import random
+ import textwrap
+ from collections import defaultdict
+ from typing import Any, Callable, Dict, List, Literal, Optional, Tuple
+
+ import numpy as np
+ import opik
+ import optuna
+ from optuna.distributions import CategoricalDistribution
+
+ import dspy
+ from dspy.evaluate.evaluate import Evaluate
+ from dspy.propose import GroundedProposer
+ from dspy.teleprompt.teleprompt import Teleprompter
+ from dspy.teleprompt.utils import (
+     create_minibatch,
+     create_n_fewshot_demo_sets,
+     eval_candidate_program,
+     get_program_with_highest_avg_score,
+     get_signature,
+     print_full_program,
+     save_candidate_program,
+     set_signature,
+ )
+
+ from opik_optimizer import task_evaluator
+ from opik_optimizer.optimization_config import mappers
+ from opik_optimizer.optimization_config.configs import MetricConfig, TaskConfig
+
+
+ class Logger:
+     """Minimal stdout logger used in place of the standard `logging` module."""
+
+     def info(self, *args, **kwargs):
+         print(*args)
+
+
+ logger = Logger()
+
+ # Constants
+ BOOTSTRAPPED_FEWSHOT_EXAMPLES_IN_CONTEXT = 3
+ LABELED_FEWSHOT_EXAMPLES_IN_CONTEXT = 0
+ MIN_MINIBATCH_SIZE = 50
+
+ AUTO_RUN_SETTINGS = {
+     "light": {"num_trials": 7, "val_size": 100},
+     "medium": {"num_trials": 25, "val_size": 300},
+     "heavy": {"num_trials": 50, "val_size": 1000},
+ }
+
+ # ANSI escape codes for colors
+ YELLOW = "\033[93m"
+ GREEN = "\033[92m"
+ BLUE = "\033[94m"
+ BOLD = "\033[1m"
+ ENDC = "\033[0m"  # Resets the color to default
+
57
+
58
+ class MIPROv2(Teleprompter):
59
+ def __init__(
60
+ self,
61
+ metric: Callable,
62
+ prompt_model: Optional[Any] = None,
63
+ task_model: Optional[Any] = None,
64
+ teacher_settings: Optional[Dict] = None,
65
+ max_bootstrapped_demos: int = 4,
66
+ max_labeled_demos: int = 4,
67
+ auto: Optional[Literal["light", "medium", "heavy"]] = "medium",
68
+ num_candidates: int = 10,
69
+ num_threads: Optional[int] = None,
70
+ max_errors: int = 10,
71
+ seed: int = 9,
72
+ init_temperature: float = 0.5,
73
+ verbose: bool = False,
74
+ track_stats: bool = True,
75
+ log_dir: Optional[str] = None,
76
+ metric_threshold: Optional[float] = None,
77
+ opik_dataset: Optional[opik.Dataset] = None,
78
+ opik_metric_config: Optional[MetricConfig] = None,
79
+ opik_prompt_task_config: Optional[TaskConfig] = None,
80
+ opik_project_name: Optional[str] = None,
81
+ opik_optimization_id: Optional[str] = None,
82
+ experiment_config: Optional[Dict[str, Any]] = None,
83
+ ):
84
+ # Validate 'auto' parameter
85
+ allowed_modes = {None, "light", "medium", "heavy"}
86
+ if auto not in allowed_modes:
87
+ raise ValueError(
88
+ f"Invalid value for auto: {auto}. Must be one of {allowed_modes}."
89
+ )
90
+ self.auto = auto
91
+
92
+ self.num_candidates = num_candidates
93
+ self.metric = metric
94
+ self.init_temperature = init_temperature
95
+ self.task_model = task_model if task_model else dspy.settings.lm
96
+ self.prompt_model = prompt_model if prompt_model else dspy.settings.lm
97
+ self.max_bootstrapped_demos = max_bootstrapped_demos
98
+ self.max_labeled_demos = max_labeled_demos
99
+ self.verbose = verbose
100
+ self.track_stats = track_stats
101
+ self.log_dir = log_dir
102
+ self.teacher_settings = teacher_settings or {}
103
+ self.prompt_model_total_calls = 0
104
+ self.total_calls = 0
105
+ self.num_threads = num_threads
106
+ self.max_errors = max_errors
107
+ self.metric_threshold = metric_threshold
108
+ self.seed = seed
109
+ self.rng = None
110
+
111
+ self.opik_dataset = opik_dataset
112
+ self.opik_metric_config = opik_metric_config
113
+ self.opik_prompt_task_config = opik_prompt_task_config
114
+ self.opik_project_name = opik_project_name
115
+ self.opik_optimization_id = opik_optimization_id
116
+ self.experiment_config = experiment_config or {}
117
+
118
+ def compile(
119
+ self,
120
+ student: Any,
121
+ *,
122
+ trainset: List,
123
+ teacher: Any = None,
124
+ valset: Optional[List] = None,
125
+ num_trials: int = 30,
126
+ max_bootstrapped_demos: Optional[int] = None,
127
+ max_labeled_demos: Optional[int] = None,
128
+ seed: Optional[int] = None,
129
+ minibatch: bool = True,
130
+ minibatch_size: int = 35,
131
+ minibatch_full_eval_steps: int = 5,
132
+ program_aware_proposer: bool = True,
133
+ data_aware_proposer: bool = True,
134
+ view_data_batch_size: int = 10,
135
+ tip_aware_proposer: bool = True,
136
+ fewshot_aware_proposer: bool = True,
137
+ requires_permission_to_run: bool = True,
138
+ provide_traceback: Optional[bool] = None,
139
+ ) -> Any:
140
+ # Set random seeds
141
+ seed = seed or self.seed
142
+ self._set_random_seeds(seed)
143
+
144
+ # Update max demos if specified
145
+ if max_bootstrapped_demos is not None:
146
+ self.max_bootstrapped_demos = max_bootstrapped_demos
147
+ if max_labeled_demos is not None:
148
+ self.max_labeled_demos = max_labeled_demos
149
+
150
+ # Set training & validation sets
151
+ trainset, valset = self._set_and_validate_datasets(trainset, valset)
152
+
153
+ # Set hyperparameters based on run mode (if set)
154
+ zeroshot_opt = (self.max_bootstrapped_demos == 0) and (
155
+ self.max_labeled_demos == 0
156
+ )
157
+ num_trials, valset, minibatch = self._set_hyperparams_from_run_mode(
158
+ student, num_trials, minibatch, zeroshot_opt, valset
159
+ )
160
+
161
+ if self.auto:
162
+ self._print_auto_run_settings(num_trials, minibatch, valset)
163
+
164
+ if minibatch and minibatch_size > len(valset):
165
+ raise ValueError(
166
+ f"Minibatch size cannot exceed the size of the valset. Valset size: {len(valset)}."
167
+ )
168
+
169
+ # Estimate LM calls and get user confirmation
170
+ if requires_permission_to_run:
171
+ if not self._get_user_confirmation(
172
+ student,
173
+ num_trials,
174
+ minibatch,
175
+ minibatch_size,
176
+ minibatch_full_eval_steps,
177
+ valset,
178
+ program_aware_proposer,
179
+ ):
180
+ logger.info("Compilation aborted by the user.")
181
+ return student # Return the original student program
182
+
183
+ # Initialize program and evaluator
184
+ program = student.deepcopy()
185
+ evaluate = Evaluate(
186
+ devset=valset,
187
+ metric=self.metric,
188
+ num_threads=self.num_threads,
189
+ max_errors=self.max_errors,
190
+ display_table=False,
191
+ display_progress=True,
192
+ provide_traceback=provide_traceback,
193
+ )
194
+
195
+ # Step 1: Bootstrap few-shot examples
196
+ demo_candidates = self._bootstrap_fewshot_examples(
197
+ program, trainset, seed, teacher
198
+ )
199
+
200
+ # Step 2: Propose instruction candidates
201
+ try:
202
+ instruction_candidates = self._propose_instructions(
203
+ program,
204
+ trainset,
205
+ demo_candidates,
206
+ view_data_batch_size,
207
+ program_aware_proposer,
208
+ data_aware_proposer,
209
+ tip_aware_proposer,
210
+ fewshot_aware_proposer,
211
+ )
212
+ except RuntimeError:
213
+ raise Exception("Make sure you have a provider API key set.") from None
214
+
215
+ # If zero-shot, discard demos
216
+ if zeroshot_opt:
217
+ demo_candidates = None
218
+
219
+ # Step 3: Find optimal prompt parameters
220
+ best_program = self._optimize_prompt_parameters(
221
+ program,
222
+ instruction_candidates,
223
+ demo_candidates,
224
+ evaluate,
225
+ valset,
226
+ num_trials,
227
+ minibatch,
228
+ minibatch_size,
229
+ minibatch_full_eval_steps,
230
+ seed,
231
+ )
232
+
233
+ return best_program
234
+
235
+ def _set_random_seeds(self, seed):
236
+ self.rng = random.Random(seed)
237
+ np.random.seed(seed)
238
+
239
+ def _set_hyperparams_from_run_mode(
240
+ self,
241
+ program: Any,
242
+ num_trials: int,
243
+ minibatch: bool,
244
+ zeroshot_opt: bool,
245
+ valset: List,
246
+ ) -> Tuple[int, List, bool]:
247
+ if self.auto is None:
248
+ return num_trials, valset, minibatch
249
+
250
+ num_vars = len(program.predictors())
251
+ if not zeroshot_opt:
252
+ num_vars *= 2 # Account for few-shot examples + instruction variables
253
+
254
+ auto_settings = AUTO_RUN_SETTINGS[self.auto]
255
+ num_trials = auto_settings["num_trials"]
256
+ valset = create_minibatch(
257
+ valset, batch_size=auto_settings["val_size"], rng=self.rng
258
+ )
259
+ minibatch = len(valset) > MIN_MINIBATCH_SIZE
260
+ self.num_candidates = int(
261
+ np.round(np.min([num_trials * num_vars, (1.5 * num_trials) / num_vars]))
262
+ )
263
+
264
+ return num_trials, valset, minibatch
265
+
266
+ def _set_and_validate_datasets(self, trainset: List, valset: Optional[List]):
267
+ if not trainset:
268
+ raise ValueError("Trainset cannot be empty.")
269
+
270
+ if valset is None:
271
+ if len(trainset) < 2:
272
+ raise ValueError(
273
+ "Trainset must have at least 2 examples if no valset specified."
274
+ )
275
+ valset_size = min(1000, max(1, int(len(trainset) * 0.80)))
276
+ cutoff = len(trainset) - valset_size
277
+ valset = trainset[cutoff:]
278
+ trainset = trainset[:cutoff]
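+ # For example (illustrative): a 100-example trainset is split into 20 training examples
+ # and an 80-example validation set.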
279
+ else:
280
+ if len(valset) < 1:
281
+ raise ValueError("Validation set must have at least 1 example.")
282
+
283
+ return trainset, valset
284
+
285
+ def _print_auto_run_settings(self, num_trials: int, minibatch: bool, valset: List):
286
+ logger.info(
287
+ f"\nRUNNING WITH THE FOLLOWING {self.auto.upper()} AUTO RUN SETTINGS:"
288
+ f"\nnum_trials: {num_trials}"
289
+ f"\nminibatch: {minibatch}"
290
+ f"\nnum_candidates: {self.num_candidates}"
291
+ f"\nvalset size: {len(valset)}\n"
292
+ )
293
+
294
+ def _estimate_lm_calls(
295
+ self,
296
+ program: Any,
297
+ num_trials: int,
298
+ minibatch: bool,
299
+ minibatch_size: int,
300
+ minibatch_full_eval_steps: int,
301
+ valset: List,
302
+ program_aware_proposer: bool,
303
+ ) -> Tuple[str, str]:
304
+ num_predictors = len(program.predictors())
305
+
306
+ # Estimate prompt model calls
307
+ estimated_prompt_model_calls = (
308
+ 10 # Data summarizer calls
309
+ + self.num_candidates * num_predictors # Candidate generation
310
+ + (
311
+ num_predictors + 1 if program_aware_proposer else 0
312
+ ) # Program-aware proposer
313
+ )
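+ # For example (illustrative): with num_candidates=10, a single predictor, and the
+ # program-aware proposer enabled, this is 10 + 10 * 1 + (1 + 1) = 22 prompt model calls.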
314
+ prompt_model_line = (
315
+ f"{YELLOW}- Prompt Generation: {BLUE}{BOLD}10{ENDC}{YELLOW} data summarizer calls + "
316
+ f"{BLUE}{BOLD}{self.num_candidates}{ENDC}{YELLOW} * "
317
+ f"{BLUE}{BOLD}{num_predictors}{ENDC}{YELLOW} lm calls in program "
318
+ f"+ ({BLUE}{BOLD}{num_predictors + 1}{ENDC}{YELLOW}) lm calls in program-aware proposer "
319
+ f"= {BLUE}{BOLD}{estimated_prompt_model_calls}{ENDC}{YELLOW} prompt model calls{ENDC}"
320
+ )
321
+
322
+ # Estimate task model calls
323
+ if not minibatch:
324
+ estimated_task_model_calls = len(valset) * num_trials
325
+ task_model_line = (
326
+ f"{YELLOW}- Program Evaluation: {BLUE}{BOLD}{len(valset)}{ENDC}{YELLOW} examples in val set * "
327
+ f"{BLUE}{BOLD}{num_trials}{ENDC}{YELLOW} batches = "
328
+ f"{BLUE}{BOLD}{estimated_task_model_calls}{ENDC}{YELLOW} LM program calls{ENDC}"
329
+ )
330
+ else:
331
+ full_eval_steps = num_trials // minibatch_full_eval_steps + 1
332
+ estimated_task_model_calls = (
333
+ minibatch_size * num_trials + len(valset) * full_eval_steps
334
+ )
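+ # For example (illustrative): 35 examples per minibatch * 25 trials + 300 validation examples
+ # * (25 // 5 + 1) full evals = 875 + 1800 = 2675 LM program calls.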
335
+ task_model_line = (
336
+ f"{YELLOW}- Program Evaluation: {BLUE}{BOLD}{minibatch_size}{ENDC}{YELLOW} examples in minibatch * "
337
+ f"{BLUE}{BOLD}{num_trials}{ENDC}{YELLOW} batches + "
338
+ f"{BLUE}{BOLD}{len(valset)}{ENDC}{YELLOW} examples in val set * "
339
+ f"{BLUE}{BOLD}{full_eval_steps}{ENDC}{YELLOW} full evals = "
340
+ f"{BLUE}{BOLD}{estimated_task_model_calls}{ENDC}{YELLOW} LM Program calls{ENDC}"
341
+ )
342
+
343
+ return prompt_model_line, task_model_line
344
+
345
+ def _get_user_confirmation(
346
+ self,
347
+ program: Any,
348
+ num_trials: int,
349
+ minibatch: bool,
350
+ minibatch_size: int,
351
+ minibatch_full_eval_steps: int,
352
+ valset: List,
353
+ program_aware_proposer: bool,
354
+ ) -> bool:
355
+ prompt_model_line, task_model_line = self._estimate_lm_calls(
356
+ program,
357
+ num_trials,
358
+ minibatch,
359
+ minibatch_size,
360
+ minibatch_full_eval_steps,
361
+ valset,
362
+ program_aware_proposer,
363
+ )
364
+
365
+ user_message = textwrap.dedent(
366
+ f"""\
367
+ {YELLOW}{BOLD}Projected Language Model (LM) Calls{ENDC}
368
+
369
+ Based on the parameters you have set, the maximum number of LM calls is projected as follows:
370
+
371
+ {prompt_model_line}
372
+ {task_model_line}
373
+
374
+ {YELLOW}{BOLD}Estimated Cost Calculation:{ENDC}
375
+
376
+ {YELLOW}Total Cost = (Number of calls to task model * (Avg Input Token Length per Call * Task Model Price per Input Token + Avg Output Token Length per Call * Task Model Price per Output Token))
+ + (Number of calls to prompt model * (Avg Input Token Length per Call * Prompt Model Price per Input Token + Avg Output Token Length per Call * Prompt Model Price per Output Token)).{ENDC}
378
+
379
+ For a preliminary estimate of potential costs, we recommend you perform your own calculations based on the task
380
+ and prompt models you intend to use. If the projected costs exceed your budget or expectations, you may consider:
381
+
382
+ {YELLOW}- Reducing the number of trials (`num_trials`), the size of the valset, or the number of LM calls in your program.{ENDC}
383
+ {YELLOW}- Using a cheaper task model to optimize the prompt.{ENDC}
384
+ {YELLOW}- Setting `minibatch=True` if you haven't already.{ENDC}\n"""
385
+ )
386
+
387
+ user_confirmation_message = textwrap.dedent(
388
+ f"""\
389
+ To proceed with the execution of this program, please confirm by typing {BLUE}'y'{ENDC} for yes or {BLUE}'n'{ENDC} for no.
390
+
391
+ If you would like to bypass this confirmation step in future executions, set the {YELLOW}`requires_permission_to_run`{ENDC} flag to {YELLOW}`False`{ENDC} when calling compile.
392
+
393
+ {YELLOW}Awaiting your input...{ENDC}
394
+ """
395
+ )
396
+
397
+ user_input = (
398
+ input(
399
+ f"{user_message}\n{user_confirmation_message}\nDo you wish to continue? (y/n): "
400
+ )
401
+ .strip()
402
+ .lower()
403
+ )
404
+ return user_input == "y"
405
+
406
+ def _bootstrap_fewshot_examples(
407
+ self, program: Any, trainset: List, seed: int, teacher: Any
408
+ ) -> Optional[List]:
409
+ logger.info("\n==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==")
410
+ if self.max_bootstrapped_demos > 0:
411
+ logger.info(
412
+ "These will be used as few-shot example candidates for our program and for creating instructions.\n"
413
+ )
414
+ else:
415
+ logger.info("These will be used for informing instruction proposal.\n")
416
+
417
+ logger.info(f"Bootstrapping N={self.num_candidates} sets of demonstrations...")
418
+
419
+ zeroshot = self.max_bootstrapped_demos == 0 and self.max_labeled_demos == 0
420
+
421
+ try:
422
+ demo_candidates = create_n_fewshot_demo_sets(
423
+ student=program,
424
+ num_candidate_sets=self.num_candidates,
425
+ trainset=trainset,
426
+ max_labeled_demos=(
427
+ LABELED_FEWSHOT_EXAMPLES_IN_CONTEXT
428
+ if zeroshot
429
+ else self.max_labeled_demos
430
+ ),
431
+ max_bootstrapped_demos=(
432
+ BOOTSTRAPPED_FEWSHOT_EXAMPLES_IN_CONTEXT
433
+ if zeroshot
434
+ else self.max_bootstrapped_demos
435
+ ),
436
+ metric=self.metric,
437
+ max_errors=self.max_errors,
438
+ teacher=teacher,
439
+ teacher_settings=self.teacher_settings,
440
+ seed=seed,
441
+ metric_threshold=self.metric_threshold,
442
+ rng=self.rng,
443
+ )
444
+ except Exception as e:
445
+ logger.info(f"Error generating few-shot examples: {e}")
446
+ logger.info("Running without few-shot examples.")
447
+ demo_candidates = None
448
+
449
+ return demo_candidates
450
+
451
+ def _propose_instructions(
452
+ self,
453
+ program: Any,
454
+ trainset: List,
455
+ demo_candidates: Optional[List],
456
+ view_data_batch_size: int,
457
+ program_aware_proposer: bool,
458
+ data_aware_proposer: bool,
459
+ tip_aware_proposer: bool,
460
+ fewshot_aware_proposer: bool,
461
+ ) -> Dict[int, List[str]]:
462
+ logger.info("\n==> STEP 2: PROPOSE INSTRUCTION CANDIDATES <==")
463
+ logger.info(
464
+ "We will use the few-shot examples from the previous step, a generated dataset summary, a summary of the program code, and a randomly selected prompting tip to propose instructions."
465
+ )
466
+
467
+ proposer = GroundedProposer(
468
+ program=program,
469
+ trainset=trainset,
470
+ prompt_model=self.prompt_model,
471
+ view_data_batch_size=view_data_batch_size,
472
+ program_aware=program_aware_proposer,
473
+ use_dataset_summary=data_aware_proposer,
474
+ use_task_demos=fewshot_aware_proposer,
475
+ num_demos_in_context=BOOTSTRAPPED_FEWSHOT_EXAMPLES_IN_CONTEXT,
476
+ use_tip=tip_aware_proposer,
477
+ set_tip_randomly=tip_aware_proposer,
478
+ use_instruct_history=False,
479
+ set_history_randomly=False,
480
+ verbose=self.verbose,
481
+ rng=self.rng,
482
+ )
483
+
484
+ logger.info("\nProposing instructions...\n")
485
+ instruction_candidates = proposer.propose_instructions_for_program(
486
+ trainset=trainset,
487
+ program=program,
488
+ demo_candidates=demo_candidates,
489
+ N=self.num_candidates,
490
+ T=self.init_temperature,
491
+ trial_logs={},
492
+ )
493
+
494
+ for i, pred in enumerate(program.predictors()):
495
+ logger.info(f"Proposed Instructions for Predictor {i}:\n")
496
+ instruction_candidates[i][0] = get_signature(pred).instructions
497
+ for j, instruction in enumerate(instruction_candidates[i]):
498
+ logger.info(f"{j}: {instruction}\n")
499
+ logger.info("\n")
500
+
501
+ return instruction_candidates
502
+
503
+ def _optimize_prompt_parameters(
504
+ self,
505
+ program: Any,
506
+ instruction_candidates: Dict[int, List[str]],
507
+ demo_candidates: Optional[List],
508
+ evaluate: Evaluate,
509
+ valset: List,
510
+ num_trials: int,
511
+ minibatch: bool,
512
+ minibatch_size: int,
513
+ minibatch_full_eval_steps: int,
514
+ seed: int,
515
+ ) -> Optional[Any]:
516
+ # Run optimization
517
+ optuna.logging.set_verbosity(optuna.logging.WARNING)
518
+ logger.info("==> STEP 3: FINDING OPTIMAL PROMPT PARAMETERS <==")
519
+ logger.info(
520
+ "We will evaluate the program over a series of trials with different combinations of instructions and few-shot examples to find the optimal combination using Bayesian Optimization.\n"
521
+ )
522
+
523
+ # Compute the adjusted total trials that we will run (including full evals)
524
+ run_additional_full_eval_at_end = (
525
+ 1 if num_trials % minibatch_full_eval_steps != 0 else 0
526
+ )
527
+ adjusted_num_trials = (
528
+ (
529
+ num_trials
530
+ + num_trials // minibatch_full_eval_steps
531
+ + 1
532
+ + run_additional_full_eval_at_end
533
+ )
534
+ if minibatch
535
+ else num_trials
536
+ )
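+ # For example (illustrative): with num_trials=25 and minibatch_full_eval_steps=5 this is
+ # 25 + 5 + 1 + 0 = 31 total trials (minibatch trials plus periodic and final full evaluations).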
537
+ logger.info(
538
+ f"== Trial {1} / {adjusted_num_trials} - Full Evaluation of Default Program =="
539
+ )
540
+
541
+ # default_score, _ = eval_candidate_program(
542
+ # len(valset), valset, program, evaluate, self.rng, return_all_scores=True
543
+ # )
544
+
545
+ # Collect bootstrapped demo examples (if any) for the experiment configuration.
+ examples = []
+ if demo_candidates:
+     for demo_sets in demo_candidates.values():
+         for demo_set in demo_sets:
+             for example in demo_set:
+                 examples.append(example.toDict())
550
+ prompt = program.signature.instructions
551
+ experiment_config = {
552
+ **self.experiment_config,
553
+ **{"configuration": {
554
+ "prompt": prompt,
555
+ "examples": examples,
556
+ },
557
+ "evaluation": "initial",
558
+ }
559
+ }
560
+
561
+ default_score = eval_candidate_program_with_opik(
562
+ opik_dataset=self.opik_dataset,
563
+ trainset=valset,
564
+ candidate_program=program,
565
+ metric_config=self.opik_metric_config,
566
+ prompt_task_config=self.opik_prompt_task_config,
567
+ project_name=self.opik_project_name,
568
+ num_threads=self.num_threads,
569
+ experiment_config=experiment_config,
570
+ optimization_id=self.opik_optimization_id,
571
+ )
572
+
573
+ logger.info(f"Default program score: {default_score}\n")
574
+
575
+ trial_logs = {}
576
+ trial_logs[1] = {}
577
+ trial_logs[1]["full_eval_program_path"] = save_candidate_program(
578
+ program, self.log_dir, -1
579
+ )
580
+ trial_logs[1]["full_eval_score"] = default_score
581
+ trial_logs[1]["total_eval_calls_so_far"] = len(valset)
582
+ trial_logs[1]["full_eval_program"] = program.deepcopy()
583
+
584
+ if default_score == 1.0:
585
+ return self.early_stop(default_score, program)
586
+
587
+ # Initialize optimization variables
588
+ best_score = default_score
589
+ best_program = program.deepcopy()
590
+ total_eval_calls = len(valset)
591
+ score_data = [
592
+ {"score": best_score, "program": program.deepcopy(), "full_eval": True}
593
+ ]
594
+ param_score_dict = defaultdict(list)
595
+ fully_evaled_param_combos = {}
596
+
597
+ # Define the objective function
598
+ def objective(trial):
599
+ nonlocal program, best_program, best_score, trial_logs, total_eval_calls, score_data
600
+
601
+ trial_num = trial.number + 1
602
+ if minibatch:
603
+ logger.info(
604
+ f"== Trial {trial_num} / {adjusted_num_trials} - Minibatch =="
605
+ )
606
+ else:
607
+ logger.info(f"===== Trial {trial_num} / {num_trials} =====")
608
+
609
+ trial_logs[trial_num] = {}
610
+
611
+ # Create a new candidate program
612
+ candidate_program = program.deepcopy()
613
+
614
+ # Choose instructions and demos, insert them into the program
615
+ chosen_params, raw_chosen_params = (
616
+ self._select_and_insert_instructions_and_demos(
617
+ candidate_program,
618
+ instruction_candidates,
619
+ demo_candidates,
620
+ trial,
621
+ trial_logs,
622
+ trial_num,
623
+ )
624
+ )
625
+
626
+ # Log assembled program
627
+ if self.verbose:
628
+ logger.info("Evaluating the following candidate program...\n")
629
+ print_full_program(candidate_program)
630
+
631
+ # Evaluate the candidate program (on minibatch if minibatch=True)
632
+ batch_size = minibatch_size if minibatch else len(valset)
633
+ score = eval_candidate_program(
634
+ batch_size, valset, candidate_program, evaluate, self.rng
635
+ )
636
+ # score = eval_candidate_program_with_opik(
637
+ # opik_dataset=self.opik_dataset,
638
+ # trainset=valset,
639
+ # candidate_program=candidate_program,
640
+ # metric_config=self.opik_metric_config,
641
+ # prompt_task_config=self.opik_prompt_task_config,
642
+ # project_name=self.opik_project_name,
643
+ # experiment_config=experiment_config,
644
+ # )
645
+ total_eval_calls += batch_size
646
+
647
+ # Update best score and program
648
+ if not minibatch and score > best_score:
649
+ best_score = score
650
+ best_program = candidate_program.deepcopy()
651
+ logger.info(f"{GREEN}Best full score so far!{ENDC} Score: {score}")
652
+
653
+ # Log evaluation results
654
+ score_data.append(
655
+ {
656
+ "score": score,
657
+ "program": candidate_program,
658
+ "full_eval": batch_size >= len(valset),
659
+ }
660
+ ) # score, prog, full_eval
661
+ if minibatch:
662
+ self._log_minibatch_eval(
663
+ score,
664
+ best_score,
665
+ batch_size,
666
+ chosen_params,
667
+ score_data,
668
+ trial,
669
+ adjusted_num_trials,
670
+ trial_logs,
671
+ trial_num,
672
+ candidate_program,
673
+ total_eval_calls,
674
+ )
675
+ else:
676
+ self._log_normal_eval(
677
+ score,
678
+ best_score,
679
+ chosen_params,
680
+ score_data,
681
+ trial,
682
+ num_trials,
683
+ trial_logs,
684
+ trial_num,
685
+ valset,
686
+ batch_size,
687
+ candidate_program,
688
+ total_eval_calls,
689
+ )
690
+ categorical_key = ",".join(map(str, chosen_params))
691
+ param_score_dict[categorical_key].append(
692
+ (score, candidate_program, raw_chosen_params),
693
+ )
694
+
695
+ # If minibatch, perform full evaluation at intervals (and at the very end)
696
+ if minibatch and (
697
+ (trial_num % (minibatch_full_eval_steps + 1) == 0)
698
+ or (trial_num == (adjusted_num_trials - 1))
699
+ ):
700
+ best_score, best_program, total_eval_calls = (
701
+ self._perform_full_evaluation(
702
+ trial_num,
703
+ adjusted_num_trials,
704
+ param_score_dict,
705
+ fully_evaled_param_combos,
706
+ evaluate,
707
+ valset,
708
+ trial_logs,
709
+ total_eval_calls,
710
+ score_data,
711
+ best_score,
712
+ best_program,
713
+ study,
714
+ instruction_candidates,
715
+ demo_candidates,
716
+ )
717
+ )
718
+
719
+ return score
720
+
721
+ sampler = optuna.samplers.TPESampler(seed=seed, multivariate=True)
722
+ study = optuna.create_study(direction="maximize", sampler=sampler)
723
+
724
+ default_params = {
725
+ f"{i}_predictor_instruction": 0 for i in range(len(program.predictors()))
726
+ }
727
+ if demo_candidates:
728
+ default_params.update(
729
+ {f"{i}_predictor_demos": 0 for i in range(len(program.predictors()))}
730
+ )
731
+
732
+ # Add default run as a baseline in optuna (TODO: figure out how to weight this by # of samples evaluated on)
733
+ trial = optuna.trial.create_trial(
734
+ params=default_params,
735
+ distributions=self._get_param_distributions(
736
+ program, instruction_candidates, demo_candidates
737
+ ),
738
+ value=default_score,
739
+ )
740
+ study.add_trial(trial)
741
+ study.optimize(objective, n_trials=num_trials)
742
+
743
+ # Attach logs to best program
744
+ if best_program is not None and self.track_stats:
745
+ best_program.trial_logs = trial_logs
746
+ best_program.score = best_score
747
+ best_program.prompt_model_total_calls = self.prompt_model_total_calls
748
+ best_program.total_calls = self.total_calls
749
+ sorted_candidate_programs = sorted(
750
+ score_data, key=lambda x: x["score"], reverse=True
751
+ )
752
+ # Attach all minibatch programs
753
+ best_program.mb_candidate_programs = [
754
+ score_data
755
+ for score_data in sorted_candidate_programs
756
+ if not score_data["full_eval"]
757
+ ]
758
+ # Attach all programs that were evaluated on the full trainset, in descending order of score
759
+ best_program.candidate_programs = [
760
+ score_data
761
+ for score_data in sorted_candidate_programs
762
+ if score_data["full_eval"]
763
+ ]
764
+
765
+ logger.info(f"Returning best identified program with score {best_score}!")
766
+
767
+ return best_program
768
+
769
+ def _log_minibatch_eval(
770
+ self,
771
+ score,
772
+ best_score,
773
+ batch_size,
774
+ chosen_params,
775
+ score_data,
776
+ trial,
777
+ adjusted_num_trials,
778
+ trial_logs,
779
+ trial_num,
780
+ candidate_program,
781
+ total_eval_calls,
782
+ ):
783
+ trial_logs[trial_num]["mb_program_path"] = save_candidate_program(
784
+ candidate_program, self.log_dir, trial_num
785
+ )
786
+ trial_logs[trial_num]["mb_score"] = score
787
+ trial_logs[trial_num]["total_eval_calls_so_far"] = total_eval_calls
788
+ trial_logs[trial_num]["mb_program"] = candidate_program.deepcopy()
789
+
790
+ logger.info(
791
+ f"Score: {score} on minibatch of size {batch_size} with parameters {chosen_params}."
792
+ )
793
+ minibatch_scores = ", ".join(
794
+ [f"{s['score']}" for s in score_data if not s["full_eval"]]
795
+ )
796
+ logger.info(f"Minibatch scores so far: {'[' + minibatch_scores + ']'}")
797
+ full_eval_scores = ", ".join(
798
+ [f"{s['score']}" for s in score_data if s["full_eval"]]
799
+ )
800
+ trajectory = "[" + full_eval_scores + "]"
801
+ logger.info(f"Full eval scores so far: {trajectory}")
802
+ logger.info(f"Best full score so far: {best_score}")
803
+ logger.info(
804
+ f"{'=' * len(f'== Trial {trial.number + 1} / {adjusted_num_trials} - Minibatch Evaluation ==')}\n\n"
805
+ )
806
+
807
+ def _log_normal_eval(
808
+ self,
809
+ score,
810
+ best_score,
811
+ chosen_params,
812
+ score_data,
813
+ trial,
814
+ num_trials,
815
+ trial_logs,
816
+ trial_num,
817
+ valset,
818
+ batch_size,
819
+ candidate_program,
820
+ total_eval_calls,
821
+ ):
822
+ trial_logs[trial_num]["full_eval_program_path"] = save_candidate_program(
823
+ candidate_program, self.log_dir, trial_num
824
+ )
825
+ trial_logs[trial_num]["full_eval_score"] = score
826
+ trial_logs[trial_num]["total_eval_calls_so_far"] = total_eval_calls
827
+ trial_logs[trial_num]["full_eval_program"] = candidate_program.deepcopy()
828
+
829
+ logger.info(f"Score: {score} with parameters {chosen_params}.")
830
+ full_eval_scores = ", ".join(
831
+ [f"{s['score']}" for s in score_data if s["full_eval"]]
832
+ )
833
+ logger.info(f"Scores so far: {'[' + full_eval_scores + ']'}")
834
+ logger.info(f"Best score so far: {best_score}")
835
+ logger.info(
836
+ f"{'=' * len(f'===== Trial {trial.number + 1} / {num_trials} =====')}\n\n"
837
+ )
838
+
839
+ def _select_and_insert_instructions_and_demos(
840
+ self,
841
+ candidate_program: Any,
842
+ instruction_candidates: Dict[int, List[str]],
843
+ demo_candidates: Optional[List],
844
+ trial: optuna.trial.Trial,
845
+ trial_logs: Dict,
846
+ trial_num: int,
847
+ ) -> Tuple[List[str], Dict[str, int]]:
848
+ chosen_params = []
849
+ raw_chosen_params = {}
850
+
851
+ for i, predictor in enumerate(candidate_program.predictors()):
852
+ # Select instruction
853
+ instruction_idx = trial.suggest_categorical(
854
+ f"{i}_predictor_instruction", range(len(instruction_candidates[i]))
855
+ )
856
+ selected_instruction = instruction_candidates[i][instruction_idx]
857
+ updated_signature = get_signature(predictor).with_instructions(
858
+ selected_instruction
859
+ )
860
+ set_signature(predictor, updated_signature)
861
+ trial_logs[trial_num][f"{i}_predictor_instruction"] = instruction_idx
862
+ chosen_params.append(f"Predictor {i}: Instruction {instruction_idx}")
863
+ raw_chosen_params[f"{i}_predictor_instruction"] = instruction_idx
864
+ # Select demos if available
865
+ if demo_candidates:
866
+ demos_idx = trial.suggest_categorical(
867
+ f"{i}_predictor_demos", range(len(demo_candidates[i]))
868
+ )
869
+ predictor.demos = demo_candidates[i][demos_idx]
870
+ trial_logs[trial_num][f"{i}_predictor_demos"] = demos_idx
871
+ chosen_params.append(f"Predictor {i}: Few-Shot Set {demos_idx}")
872
+ raw_chosen_params[f"{i}_predictor_demos"] = demos_idx
873
+
874
+ return chosen_params, raw_chosen_params
875
+
876
+ def _get_param_distributions(
877
+ self, program, instruction_candidates, demo_candidates
878
+ ):
879
+ param_distributions = {}
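+ # For a two-predictor program with demos, this yields categorical parameters such as
+ # "0_predictor_instruction", "0_predictor_demos", "1_predictor_instruction", "1_predictor_demos".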
880
+
881
+ for i in range(len(instruction_candidates)):
882
+ param_distributions[f"{i}_predictor_instruction"] = CategoricalDistribution(
883
+ range(len(instruction_candidates[i]))
884
+ )
885
+ if demo_candidates:
886
+ param_distributions[f"{i}_predictor_demos"] = CategoricalDistribution(
887
+ range(len(demo_candidates[i]))
888
+ )
889
+
890
+ return param_distributions
891
+
892
+ def _perform_full_evaluation(
893
+ self,
894
+ trial_num: int,
895
+ adjusted_num_trials: int,
896
+ param_score_dict: Dict,
897
+ fully_evaled_param_combos: Dict,
898
+ evaluate: Evaluate,
899
+ valset: List,
900
+ trial_logs: Dict,
901
+ total_eval_calls: int,
902
+ score_data,
903
+ best_score: float,
904
+ best_program: Any,
905
+ study: optuna.Study,
906
+ instruction_candidates: List,
907
+ demo_candidates: List,
908
+ ):
909
+ logger.info(
910
+ f"===== Trial {trial_num + 1} / {adjusted_num_trials} - Full Evaluation ====="
911
+ )
912
+
913
+ # Identify best program to evaluate fully
914
+ highest_mean_program, mean_score, combo_key, params = (
915
+ get_program_with_highest_avg_score(
916
+ param_score_dict, fully_evaled_param_combos
917
+ )
918
+ )
919
+ logger.info(
920
+ f"Doing full eval on next top averaging program (Avg Score: {mean_score}) from minibatch trials..."
921
+ )
922
+ # full_eval_score_orig = eval_candidate_program(
923
+ # len(valset), valset, highest_mean_program, evaluate, self.rng
924
+ # )
925
+
926
+ # Collect bootstrapped demo examples (if any) for the experiment configuration.
+ examples = []
+ if demo_candidates:
+     for demo_sets in demo_candidates.values():
+         for demo_set in demo_sets:
+             for example in demo_set:
+                 examples.append(example.toDict())
931
+ prompt = highest_mean_program.signature.instructions
932
+ experiment_config = {
933
+ **self.experiment_config,
934
+ **{"configuration": {
935
+ "prompt": prompt,
936
+ "examples": examples,
937
+ },
938
+ "evaluation": "full",
939
+ }
940
+ }
941
+
942
+ full_eval_score = eval_candidate_program_with_opik(
943
+ opik_dataset=self.opik_dataset,
944
+ trainset=valset,
945
+ candidate_program=highest_mean_program,
946
+ metric_config=self.opik_metric_config,
947
+ prompt_task_config=self.opik_prompt_task_config,
948
+ project_name=self.opik_project_name,
949
+ num_threads=self.num_threads,
950
+ experiment_config=experiment_config,
951
+ optimization_id=self.opik_optimization_id,
952
+ )
953
+ score_data.append(
954
+ {
955
+ "score": full_eval_score,
956
+ "program": highest_mean_program,
957
+ "full_eval": True,
958
+ }
959
+ )
960
+
961
+ # Log full eval as a trial so that optuna can learn from the new results
962
+ trial = optuna.trial.create_trial(
963
+ params=params,
964
+ distributions=self._get_param_distributions(
965
+ best_program, instruction_candidates, demo_candidates
966
+ ),
967
+ value=full_eval_score,
968
+ )
969
+ study.add_trial(trial)
970
+
971
+ # Log full evaluation results
972
+ fully_evaled_param_combos[combo_key] = {
973
+ "program": highest_mean_program,
974
+ "score": full_eval_score,
975
+ }
976
+ total_eval_calls += len(valset)
977
+ trial_logs[trial_num + 1] = {}
978
+ trial_logs[trial_num + 1]["total_eval_calls_so_far"] = total_eval_calls
979
+ trial_logs[trial_num + 1]["full_eval_program_path"] = save_candidate_program(
980
+ program=highest_mean_program,
981
+ log_dir=self.log_dir,
982
+ trial_num=trial_num + 1,
983
+ note="full_eval",
984
+ )
985
+ trial_logs[trial_num + 1]["full_eval_program"] = highest_mean_program
986
+ trial_logs[trial_num + 1]["full_eval_score"] = full_eval_score
987
+
988
+ if full_eval_score == 1.0:
+     # A perfect full-eval score was reached; record it and stop refining further.
+     best_program = self.early_stop(full_eval_score, highest_mean_program)
+     return full_eval_score, best_program, total_eval_calls
990
+
991
+ # Update best score and program if necessary
992
+ if full_eval_score > best_score:
993
+ logger.info(
994
+ f"{GREEN}New best full eval score!{ENDC} Score: {full_eval_score}"
995
+ )
996
+ best_score = full_eval_score
997
+ best_program = highest_mean_program.deepcopy()
998
+ full_eval_scores = ", ".join(
999
+ [f"{s['score']}" for s in score_data if s["full_eval"]]
1000
+ )
1001
+ trajectory = "[" + full_eval_scores + "]"
1002
+ logger.info(f"Full eval scores so far: {trajectory}")
1003
+ logger.info(f"Best full score so far: {best_score}")
1004
+ logger.info(
1005
+ len(f"===== Full Eval {len(fully_evaled_param_combos) + 1} =====") * "="
1006
+ )
1007
+ logger.info("\n")
1008
+
1009
+ return best_score, best_program, total_eval_calls
1010
+
1011
+ def early_stop(self, score, program):
1012
+ program.score = score
1013
+ program.candidate_programs = [{"score": score, "program": program.deepcopy()}]
1014
+ return program
1015
+
1016
+
1017
+ def eval_candidate_program_with_opik(
1018
+ opik_dataset: opik.Dataset,
1019
+ trainset: List,
1020
+ candidate_program: Any,
1021
+ project_name: str,
1022
+ metric_config: MetricConfig,
1023
+ prompt_task_config: TaskConfig,
1024
+ num_threads: int,
1025
+ experiment_config: Optional[Dict[str, Any]] = None,
1026
+ optimization_id: Optional[str] = None,
1027
+ ):
1028
+ """Evaluate a candidate program on the trainset, using the specified batch size."""
1029
+ dataset_item_ids = [example["id"] for example in trainset]
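+ # Restrict the Opik evaluation to exactly the items present in `trainset`, via their dataset ids.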
1030
+
1031
+ def program_task(dataset_item: Dict[str, Any]) -> Dict[str, Any]:
1032
+ program_inputs = {
1033
+ input_key: dataset_item[input_key]
1034
+ for input_key in prompt_task_config.input_dataset_fields
1035
+ }
1036
+ prediction = candidate_program(**program_inputs)
1037
+
1038
+ # Increment assert and suggest failures to program's attributes
1039
+ if hasattr(candidate_program, "_assert_failures"):
1040
+ candidate_program._assert_failures += dspy.settings.get("assert_failures")
1041
+ if hasattr(candidate_program, "_suggest_failures"):
1042
+ candidate_program._suggest_failures += dspy.settings.get("suggest_failures")
1043
+
1044
+ return {mappers.from_llm_response_text(): prediction[prompt_task_config.output_dataset_field]}
1045
+
1047
+ score = task_evaluator.evaluate(
1048
+ dataset=opik_dataset,
1049
+ evaluated_task=program_task,
1050
+ metric_config=metric_config,
1051
+ dataset_item_ids=dataset_item_ids,
1052
+ project_name=project_name,
1053
+ num_threads=num_threads,
1054
+ experiment_config=experiment_config,
1055
+ optimization_id=optimization_id,
1056
+ )
1057
+
1058
+ return score