opik-optimizer 0.7.7__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package as they appear in a supported public registry. It is provided for informational purposes only.
Files changed (31)
  1. opik_optimizer/__init__.py +2 -0
  2. opik_optimizer/base_optimizer.py +6 -4
  3. opik_optimizer/datasets/__init__.py +27 -0
  4. opik_optimizer/datasets/ai2_arc.py +44 -0
  5. opik_optimizer/datasets/cnn_dailymail.py +40 -0
  6. opik_optimizer/datasets/election_questions.py +36 -0
  7. opik_optimizer/datasets/gsm8k.py +40 -0
  8. opik_optimizer/datasets/halu_eval.py +43 -0
  9. opik_optimizer/datasets/hotpot_qa.py +67 -0
  10. opik_optimizer/datasets/medhallu.py +39 -0
  11. opik_optimizer/datasets/rag_hallucinations.py +41 -0
  12. opik_optimizer/datasets/ragbench.py +40 -0
  13. opik_optimizer/datasets/tiny_test.py +57 -0
  14. opik_optimizer/datasets/truthful_qa.py +107 -0
  15. opik_optimizer/demo/datasets.py +53 -607
  16. opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +3 -1
  17. opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +88 -17
  18. opik_optimizer/logging_config.py +1 -1
  19. opik_optimizer/meta_prompt_optimizer.py +57 -11
  20. opik_optimizer/mipro_optimizer/mipro_optimizer.py +164 -16
  21. opik_optimizer/mipro_optimizer/utils.py +8 -1
  22. opik_optimizer/optimization_result.py +11 -0
  23. opik_optimizer/task_evaluator.py +6 -1
  24. opik_optimizer/utils.py +0 -52
  25. opik_optimizer-0.8.0.dist-info/METADATA +196 -0
  26. opik_optimizer-0.8.0.dist-info/RECORD +45 -0
  27. opik_optimizer-0.7.7.dist-info/METADATA +0 -174
  28. opik_optimizer-0.7.7.dist-info/RECORD +0 -33
  29. {opik_optimizer-0.7.7.dist-info → opik_optimizer-0.8.0.dist-info}/WHEEL +0 -0
  30. {opik_optimizer-0.7.7.dist-info → opik_optimizer-0.8.0.dist-info}/licenses/LICENSE +0 -0
  31. {opik_optimizer-0.7.7.dist-info → opik_optimizer-0.8.0.dist-info}/top_level.txt +0 -0

opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py
@@ -5,6 +5,7 @@ import optuna
  import optuna.samplers
  import logging
  import json
+ from datetime import datetime
 
  from opik import Dataset
  from opik_optimizer.optimization_config import mappers
@@ -26,20 +27,6 @@ _limiter = _throttle.get_rate_limiter_for_current_opik_installation()
  logger = logging.getLogger(__name__)
 
 
- @_throttle.rate_limited(_limiter)
- def _call_model(model, messages, seed, model_kwargs):
-     model_kwargs = opik_litellm_monitor.try_add_opik_monitoring_to_params(model_kwargs)
-
-     response = litellm.completion(
-         model=model,
-         messages=messages,
-         seed=seed,
-         **model_kwargs,
-     )
-
-     return response
-
-
  class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
      def __init__(
          self,
@@ -51,6 +38,7 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
          n_threads: int = 8,
          n_initial_prompts: int = 5,
          n_iterations: int = 10,
+         verbose: int = 1,
          **model_kwargs,
      ) -> None:
          super().__init__(model, project_name, **model_kwargs)
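
The new verbose flag controls progress output; it is threaded through to evaluation and to Optuna's progress bar further down in this diff. A minimal construction sketch, assuming the remaining constructor arguments and the package-level export are unchanged from 0.7.7; the model name is illustrative only:

    from opik_optimizer import FewShotBayesianOptimizer

    optimizer = FewShotBayesianOptimizer(
        model="openai/gpt-4o-mini",   # illustrative model name, not prescribed by this diff
        project_name="my-project",
        n_threads=8,
        n_initial_prompts=5,
        n_iterations=10,
        verbose=0,                    # new in 0.8.0: 0 hides progress bars, 1 (default) shows them
    )
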
@@ -60,9 +48,37 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
          self.n_threads = n_threads
          self.n_initial_prompts = n_initial_prompts
          self.n_iterations = n_iterations
+         self.verbose = verbose
          self._opik_client = opik.Opik()
+         self.llm_call_counter = 0
          logger.debug(f"Initialized FewShotBayesianOptimizer with model: {model}")
 
+     @_throttle.rate_limited(_limiter)
+     def _call_model(self, model, messages, seed, model_kwargs):
+         self.llm_call_counter += 1
+
+         current_model_kwargs = self.model_kwargs.copy()
+         current_model_kwargs.update(model_kwargs)
+
+         filtered_call_kwargs = current_model_kwargs.copy()
+         filtered_call_kwargs.pop('n_trials', None)
+         filtered_call_kwargs.pop('n_samples', None)
+         filtered_call_kwargs.pop('n_iterations', None)
+         filtered_call_kwargs.pop('min_examples', None)
+         filtered_call_kwargs.pop('max_examples', None)
+         filtered_call_kwargs.pop('n_initial_prompts', None)
+
+         final_params_for_litellm = opik_litellm_monitor.try_add_opik_monitoring_to_params(filtered_call_kwargs)
+
+         response = litellm.completion(
+             model=self.model,
+             messages=messages,
+             seed=seed,
+             num_retries=6,
+             **final_params_for_litellm,
+         )
+         return response
+
      def _split_dataset(
          self, dataset: List[Dict[str, Any]], train_ratio: float
      ) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
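
_call_model is now an instance method that counts calls and strips optimizer-only settings out of the merged kwargs before they reach LiteLLM. The filtering pattern in isolation, with made-up values for illustration:

    # Sketch of the kwarg-filtering step above; the values are hypothetical.
    model_kwargs = {"temperature": 0.2, "n_trials": 10, "n_samples": 50}

    filtered_call_kwargs = model_kwargs.copy()
    for key in ("n_trials", "n_samples", "n_iterations",
                "min_examples", "max_examples", "n_initial_prompts"):
        filtered_call_kwargs.pop(key, None)  # drop silently if the key is absent

    print(filtered_call_kwargs)  # {'temperature': 0.2}
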
@@ -96,6 +112,7 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
          n_samples: int = None,
      ) -> optimization_result.OptimizationResult:
          random.seed(self.seed)
+         self.llm_call_counter = 0
 
          if not task_config.use_chat_prompt:
              raise ValueError(
@@ -161,6 +178,7 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
              project_name=self.project_name,
              experiment_config=initial_eval_config,
              optimization_id=optimization_id,
+             verbose=self.verbose,
          )
          logger.info(f"Initial (zero-shot) score: {initial_score:.4f}")
 
@@ -222,6 +240,7 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
              project_name=self.project_name,
              experiment_config=trial_config,
              optimization_id=optimization_id,
+             verbose=self.verbose,
          )
          logger.debug(f"Trial {trial.number} score: {score:.4f}")
 
@@ -242,11 +261,59 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
          except Exception as e:
              logger.warning(f"Could not configure Optuna logging within optimizer: {e}")
 
+         # Explicitly create and seed the sampler for Optuna
          sampler = optuna.samplers.TPESampler(seed=self.seed)
          study = optuna.create_study(direction="maximize", sampler=sampler)
-         study.optimize(optimization_objective, n_trials=n_trials)
+
+         study.optimize(optimization_objective, n_trials=n_trials, show_progress_bar=(self.verbose >= 1))
          logger.info("Optuna study finished.")
 
+         optuna_history_processed = []
+         for trial_idx, trial in enumerate(study.trials):
+             if trial.state == optuna.trial.TrialState.COMPLETE:
+                 param_obj: Optional[prompt_parameter.ChatPromptParameter] = trial.user_attrs.get("param")
+                 prompt_cand_display = None  # Default to None
+                 if param_obj and hasattr(param_obj, 'as_template') and callable(param_obj.as_template):
+                     try:
+                         # .format() on ChatPromptTemplate returns the list of messages
+                         chat_messages_for_history = param_obj.as_template().format()
+                         prompt_cand_display = json.dumps(chat_messages_for_history)
+                     except Exception as e_param_format:
+                         logger.warning(f"Trial {trial.number}: Error formatting prompt from param_obj: {e_param_format}")
+                         prompt_cand_display = "Error: Could not format prompt content."
+                 elif not param_obj:
+                     logger.warning(f"Trial {trial.number}: 'param' object not found in user_attrs.")
+                     prompt_cand_display = "Error: Prompt data missing in trial."
+                 else:
+                     logger.warning(f"Trial {trial.number}: 'param' object is not of expected type or lacks methods.")
+                     prompt_cand_display = "Error: Invalid prompt data structure in trial."
+
+                 score_val = trial.value  # This can be None if trial failed to produce a score
+                 duration_val = None
+                 if trial.datetime_complete and trial.datetime_start:
+                     duration_val = (trial.datetime_complete - trial.datetime_start).total_seconds()
+
+                 iter_detail = {
+                     "iteration": trial.number + 1,
+                     "timestamp": trial.datetime_start.isoformat() if trial.datetime_start else datetime.now().isoformat(),
+                     "prompt_candidate": prompt_cand_display,
+                     "parameters_used": {
+                         "optuna_params": trial.params,
+                         "example_indices": trial.user_attrs.get("example_indices", [])  # Default to empty list
+                     },
+                     "scores": [{
+                         "metric_name": metric_config.metric.name,
+                         "score": score_val,  # Can be None
+                         "opik_evaluation_id": None  # TODO
+                     }],
+                     "tokens_used": None,  # TODO
+                     "cost": None,  # TODO
+                     "duration_seconds": duration_val,
+                 }
+                 optuna_history_processed.append(iter_detail)
+             else:
+                 logger.warning(f"Skipping trial {trial.number} from history due to state: {trial.state}. Value: {trial.value}")
+
          best_trial = study.best_trial
          best_score = best_trial.value
          best_n_examples = best_trial.params["n_examples"]
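
The history block above relies only on Optuna's public FrozenTrial fields (state, value, params, user_attrs, datetime_start, datetime_complete). A self-contained sketch of the same bookkeeping on a toy study; the objective and attribute values are invented for illustration:

    import optuna

    def objective(trial: optuna.Trial) -> float:
        x = trial.suggest_float("x", -5.0, 5.0)
        trial.set_user_attr("example_indices", [0, 1, 2])  # mirrors the user_attrs usage above
        return -(x ** 2)

    study = optuna.create_study(
        direction="maximize", sampler=optuna.samplers.TPESampler(seed=42)
    )
    study.optimize(objective, n_trials=3, show_progress_bar=False)

    for trial in study.trials:
        if trial.state == optuna.trial.TrialState.COMPLETE:
            duration = (trial.datetime_complete - trial.datetime_start).total_seconds()
            print(trial.number, trial.value, trial.params, trial.user_attrs, duration)
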
@@ -279,6 +346,8 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
                  "model": self.model,
                  "temperature": self.model_kwargs.get("temperature"),
              },
+             history=optuna_history_processed,
+             llm_calls=self.llm_call_counter
          )
 
      def optimize_prompt(
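
OptimizationResult now receives the processed trial history and the LLM call count (see optimization_result.py in the file list, +11 lines). A hedged sketch of how a caller might inspect them; the call shape follows the parameter names visible in this diff and is otherwise illustrative:

    # Assumes `optimizer`, `dataset`, `metric_config`, and `task_config` are already set up.
    result = optimizer.optimize_prompt(
        dataset=dataset,
        metric_config=metric_config,
        task_config=task_config,
        n_trials=10,
        n_samples=50,
    )

    print("LLM calls during optimization:", result.llm_calls)  # new field
    for entry in result.history:  # one dict per completed Optuna trial
        print(entry["iteration"], entry["scores"][0]["score"], entry["duration_seconds"])
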
@@ -295,6 +364,7 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
              optimization = self._opik_client.create_optimization(
                  dataset_name=dataset.name,
                  objective_name=metric_config.metric.name,
+                 metadata={"optimizer": self.__class__.__name__},
              )
          except Exception:
              logger.warning(
@@ -389,6 +459,7 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
              num_threads=self.n_threads,
              project_name=self.project_name,
              experiment_config=experiment_config,
+             verbose=self.verbose,
          )
          logger.debug(f"Evaluation score: {score:.4f}")
 
@@ -400,11 +471,11 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
          def llm_task(dataset_item: Dict[str, Any]) -> Dict[str, Any]:
              prompt_ = template.format(**dataset_item)
 
-             response = _call_model(
+             response = self._call_model(
                  model=self.model,
                  messages=prompt_,
                  seed=self.seed,
-                 model_kwargs=self.model_kwargs,
+                 model_kwargs=self.model_kwargs
              )
 
              return {

opik_optimizer/logging_config.py
@@ -63,7 +63,7 @@ def setup_logging(
      _logging_configured = True
 
      # Use level name provided by rich handler by default
-     package_logger.info(f"Opik Optimizer logging configured to level: [bold cyan]{logging.getLevelName(level)}[/bold cyan]")
+     package_logger.info(f"Opik Agent Optimizer logging configured to level: [bold cyan]{logging.getLevelName(level)}[/bold cyan]")
 
  # Ensure logger obtained after setup can be used immediately if needed
  logger = logging.getLogger(__name__)

opik_optimizer/meta_prompt_optimizer.py
@@ -101,6 +101,8 @@ class MetaPromptOptimizer(BaseOptimizer):
          adaptive_trial_threshold: Optional[float] = DEFAULT_ADAPTIVE_THRESHOLD,
          num_threads: int = 12,
          project_name: Optional[str] = None,
+         verbose: int = 1,
+         enable_context: bool = True,
          **model_kwargs,
      ):
          """
@@ -117,6 +119,8 @@ class MetaPromptOptimizer(BaseOptimizer):
              adaptive_trial_threshold: If not None, prompts scoring below `best_score * adaptive_trial_threshold` after initial trials won't get max trials.
              num_threads: Number of threads for parallel evaluation
              project_name: Optional project name for tracking
+             verbose: Controls internal logging/progress bars (0=off, 1=on).
+             enable_context: Whether to include task-specific context (metrics, examples) in the reasoning prompt.
              **model_kwargs: Additional model parameters
          """
          super().__init__(model=model, project_name=project_name, **model_kwargs)
@@ -128,9 +132,12 @@ class MetaPromptOptimizer(BaseOptimizer):
          self.max_trials = max_trials_per_candidate
          self.adaptive_threshold = adaptive_trial_threshold
          self.num_threads = num_threads
+         self.verbose = verbose
          self.dataset = None
          self.task_config = None
          self._opik_client = opik_client.get_client_cached()
+         self.llm_call_counter = 0
+         self.enable_context = enable_context
          logger.debug(
              f"Initialized MetaPromptOptimizer with model={model}, reasoning_model={self.reasoning_model}"
          )
@@ -151,6 +158,7 @@ class MetaPromptOptimizer(BaseOptimizer):
          experiment_config: Optional[Dict] = None,
          n_samples: Optional[int] = None,
          optimization_id: Optional[str] = None,
+         verbose: int = 1,
      ) -> float:
          """
          Evaluate a prompt using the given dataset and metric configuration.
@@ -177,6 +185,7 @@ class MetaPromptOptimizer(BaseOptimizer):
              experiment_config=experiment_config,
              n_samples=n_samples,
              optimization_id=optimization_id,
+             verbose=self.verbose,
          )
 
      @_throttle.rate_limited(_rate_limiter)
@@ -188,12 +197,21 @@ class MetaPromptOptimizer(BaseOptimizer):
          optimization_id: Optional[str] = None,
      ) -> str:
          """Call the model with the given prompt and return the response."""
+         self.llm_call_counter += 1
          # Note: Basic retry logic could be added here using tenacity
          try:
              # Basic LLM parameters (e.g., temperature, max_tokens)
+             base_temperature = getattr(self, "temperature", 0.3)
+             base_max_tokens = getattr(self, "max_tokens", 1000)
+
+             # Use potentially different settings for reasoning calls
+             reasoning_temperature = base_temperature  # Keep same temp unless specified otherwise
+             # Increase max_tokens for reasoning to ensure JSON fits, unless already high
+             reasoning_max_tokens = max(base_max_tokens, 3000) if is_reasoning else base_max_tokens
+
              llm_config_params = {
-                 "temperature": getattr(self, "temperature", 0.3),
-                 "max_tokens": getattr(self, "max_tokens", 1000),
+                 "temperature": reasoning_temperature if is_reasoning else base_temperature,
+                 "max_tokens": reasoning_max_tokens,
                  "top_p": getattr(self, "top_p", 1.0),
                  "frequency_penalty": getattr(self, "frequency_penalty", 0.0),
                  "presence_penalty": getattr(self, "presence_penalty", 0.0),
@@ -242,7 +260,10 @@ class MetaPromptOptimizer(BaseOptimizer):
              )
 
              response = litellm.completion(
-                 model=model_to_use, messages=messages, **final_call_params
+                 model=model_to_use,
+                 messages=messages,
+                 num_retries=6,
+                 **final_call_params
              )
              return response.choices[0].message.content
          except litellm.exceptions.RateLimitError as e:
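
Both optimizers now pass num_retries=6 straight to litellm.completion, delegating retries on transient failures to LiteLLM. In isolation (model name illustrative, API key assumed to be configured):

    import litellm

    response = litellm.completion(
        model="openai/gpt-4o-mini",   # illustrative model name
        messages=[{"role": "user", "content": "Say hello."}],
        num_retries=6,                # LiteLLM retries transient failures up to 6 times
    )
    print(response.choices[0].message.content)
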
@@ -271,6 +292,7 @@ class MetaPromptOptimizer(BaseOptimizer):
          experiment_config: Optional[Dict],
          n_samples: Optional[int],
          optimization_id: Optional[str] = None,
+         verbose: int = 1,
      ) -> float:
          # Calculate subset size for trials
          if not use_full_dataset:
@@ -429,6 +451,7 @@ class MetaPromptOptimizer(BaseOptimizer):
              n_samples=subset_size,  # Use subset_size for trials, None for full dataset
              experiment_config=experiment_config,
              optimization_id=optimization_id,
+             verbose=self.verbose,
          )
          logger.debug(f"Evaluation score: {score:.4f}")
          return score
@@ -474,7 +497,9 @@ class MetaPromptOptimizer(BaseOptimizer):
          optimization = None
          try:
              optimization = self._opik_client.create_optimization(
-                 dataset_name=dataset.name, objective_name=metric_config.metric.name
+                 dataset_name=dataset.name,
+                 objective_name=metric_config.metric.name,
+                 metadata={"optimizer": self.__class__.__name__},
              )
              logger.info(f"Created optimization with ID: {optimization.id}")
          except Exception as e:
@@ -519,6 +544,7 @@ class MetaPromptOptimizer(BaseOptimizer):
          self.auto_continue = auto_continue
          self.dataset = dataset
          self.task_config = task_config
+         self.llm_call_counter = 0  # Reset counter for run
 
          current_prompt = task_config.instruction_prompt
          experiment_config = experiment_config or {}
@@ -550,6 +576,7 @@ class MetaPromptOptimizer(BaseOptimizer):
              n_samples=n_samples,
              experiment_config=experiment_config,
              use_full_dataset=n_samples is None,
+             verbose=self.verbose,
          )
          best_score = initial_score
          best_prompt = current_prompt
@@ -617,6 +644,7 @@ class MetaPromptOptimizer(BaseOptimizer):
                  n_samples=n_samples,
                  use_full_dataset=False,
                  experiment_config=experiment_config,
+                 verbose=self.verbose,
              )
              scores.append(score)
              logger.debug(f"Trial {trial+1} score: {score:.4f}")
@@ -659,6 +687,7 @@ class MetaPromptOptimizer(BaseOptimizer):
                  n_samples=n_samples,
                  use_full_dataset=False,
                  experiment_config=experiment_config,
+                 verbose=self.verbose,
              )
              scores.append(score)
              logger.debug(
@@ -710,6 +739,7 @@ class MetaPromptOptimizer(BaseOptimizer):
              experiment_config=experiment_config,
              n_samples=n_samples,
              use_full_dataset=n_samples is None,
+             verbose=self.verbose,
          )
          logger.info(
              f"Final evaluation score for best candidate: {final_score_best_cand:.4f}"
@@ -879,6 +909,7 @@ class MetaPromptOptimizer(BaseOptimizer):
              score=best_score,
              metric_name=metric_config.metric.name,
              details=details,
+             llm_calls=self.llm_call_counter
          )
 
      def _get_task_context(self, metric_config: MetricConfig) -> str:
@@ -952,20 +983,35 @@ class MetaPromptOptimizer(BaseOptimizer):
 
          # Pass single metric_config
          history_context = self._build_history_context(previous_rounds)
-         task_context = self._get_task_context(metric_config=metric_config)
+         task_context_str = ""
+         analysis_instruction = ""
+         metric_focus_instruction = ""
+         improvement_point_1 = ""
+
+         if self.enable_context:
+             task_context_str = self._get_task_context(metric_config=metric_config)
+             analysis_instruction = "Analyze the example provided (if any), the metric description (if any), and the history of scores."
+             metric_focus_instruction = f"Focus on improving the score for the metric: {metric_config.metric.name}."
+             improvement_point_1 = "1. Be more specific and clear about expectations based on the metric and task."
+             logger.debug("Task context and metric-specific instructions enabled for reasoning prompt.")
+         else:
+             analysis_instruction = "Analyze the history of scores and the current prompt\'s performance."
+             metric_focus_instruction = "Focus on generating diverse and effective prompt variations based on the history."
+             improvement_point_1 = "1. Be more specific and clear about expectations based on the task."
+             logger.debug("Task context and metric-specific instructions disabled for reasoning prompt.")
 
          user_prompt = f"""Current prompt: {current_prompt}
  Current score: {best_score}
  {history_context}
- {task_context}
+ {task_context_str}
 
- Analyze the example provided, the metric description, and the history of scores.
+ {analysis_instruction}
  Generate {self.num_prompts_per_round} improved versions of this prompt.
- Focus on improving the score for the metric: {metric_config.metric.name}.
+ {metric_focus_instruction}
  Each version should aim to:
- 1. Be more specific and clear about expectations based on the metric and task.
- 2. Provide necessary context and constraints.
- 3. Guide the model to produce the desired output format suitable for the metric.
+ {improvement_point_1}
+ 2. Provide necessary context and constraints (if applicable, without relying on disabled external context).
+ 3. Guide the model to produce the desired output format suitable for the task.
  4. Remove ambiguity and unnecessary elements.
  5. Maintain conciseness while being complete.