opik-optimizer 0.7.8__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff shows the contents of two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their public registry.
- opik_optimizer/__init__.py +2 -0
- opik_optimizer/base_optimizer.py +6 -4
- opik_optimizer/datasets/__init__.py +27 -0
- opik_optimizer/datasets/ai2_arc.py +44 -0
- opik_optimizer/datasets/cnn_dailymail.py +40 -0
- opik_optimizer/datasets/election_questions.py +36 -0
- opik_optimizer/datasets/gsm8k.py +40 -0
- opik_optimizer/datasets/halu_eval.py +43 -0
- opik_optimizer/datasets/hotpot_qa.py +67 -0
- opik_optimizer/datasets/medhallu.py +39 -0
- opik_optimizer/datasets/rag_hallucinations.py +41 -0
- opik_optimizer/datasets/ragbench.py +40 -0
- opik_optimizer/datasets/tiny_test.py +57 -0
- opik_optimizer/datasets/truthful_qa.py +107 -0
- opik_optimizer/demo/datasets.py +53 -607
- opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +3 -1
- opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +88 -17
- opik_optimizer/logging_config.py +1 -1
- opik_optimizer/meta_prompt_optimizer.py +57 -11
- opik_optimizer/mipro_optimizer/mipro_optimizer.py +151 -13
- opik_optimizer/optimization_result.py +11 -0
- opik_optimizer/task_evaluator.py +6 -1
- opik_optimizer/utils.py +0 -52
- opik_optimizer-0.8.0.dist-info/METADATA +196 -0
- opik_optimizer-0.8.0.dist-info/RECORD +45 -0
- opik_optimizer-0.7.8.dist-info/METADATA +0 -174
- opik_optimizer-0.7.8.dist-info/RECORD +0 -33
- {opik_optimizer-0.7.8.dist-info → opik_optimizer-0.8.0.dist-info}/WHEEL +0 -0
- {opik_optimizer-0.7.8.dist-info → opik_optimizer-0.8.0.dist-info}/licenses/LICENSE +0 -0
- {opik_optimizer-0.7.8.dist-info → opik_optimizer-0.8.0.dist-info}/top_level.txt +0 -0
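The largest structural change is the new `opik_optimizer.datasets` package, which replaces most of `opik_optimizer/demo/datasets.py`. A minimal usage sketch follows; the module names come from the file list above, but the loader function names are assumptions that are not confirmed by this diff.

# Hypothetical usage -- "tiny_test" as a loader name is an assumption based on
# the module file opik_optimizer/datasets/tiny_test.py listed above.
from opik_optimizer import datasets

dataset = datasets.tiny_test()    # assumed to return an opik Dataset
print(len(dataset.get_items()))   # get_items() is used by the optimizers in the hunks below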
opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py
CHANGED
@@ -5,6 +5,7 @@ import optuna
 import optuna.samplers
 import logging
 import json
+from datetime import datetime
 
 from opik import Dataset
 from opik_optimizer.optimization_config import mappers
@@ -26,20 +27,6 @@ _limiter = _throttle.get_rate_limiter_for_current_opik_installation()
 logger = logging.getLogger(__name__)
 
 
-@_throttle.rate_limited(_limiter)
-def _call_model(model, messages, seed, model_kwargs):
-    model_kwargs = opik_litellm_monitor.try_add_opik_monitoring_to_params(model_kwargs)
-
-    response = litellm.completion(
-        model=model,
-        messages=messages,
-        seed=seed,
-        **model_kwargs,
-    )
-
-    return response
-
-
 class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
     def __init__(
         self,
@@ -51,6 +38,7 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
         n_threads: int = 8,
         n_initial_prompts: int = 5,
         n_iterations: int = 10,
+        verbose: int = 1,
         **model_kwargs,
     ) -> None:
         super().__init__(model, project_name, **model_kwargs)
@@ -60,9 +48,37 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
         self.n_threads = n_threads
         self.n_initial_prompts = n_initial_prompts
         self.n_iterations = n_iterations
+        self.verbose = verbose
         self._opik_client = opik.Opik()
+        self.llm_call_counter = 0
         logger.debug(f"Initialized FewShotBayesianOptimizer with model: {model}")
 
+    @_throttle.rate_limited(_limiter)
+    def _call_model(self, model, messages, seed, model_kwargs):
+        self.llm_call_counter += 1
+
+        current_model_kwargs = self.model_kwargs.copy()
+        current_model_kwargs.update(model_kwargs)
+
+        filtered_call_kwargs = current_model_kwargs.copy()
+        filtered_call_kwargs.pop('n_trials', None)
+        filtered_call_kwargs.pop('n_samples', None)
+        filtered_call_kwargs.pop('n_iterations', None)
+        filtered_call_kwargs.pop('min_examples', None)
+        filtered_call_kwargs.pop('max_examples', None)
+        filtered_call_kwargs.pop('n_initial_prompts', None)
+
+        final_params_for_litellm = opik_litellm_monitor.try_add_opik_monitoring_to_params(filtered_call_kwargs)
+
+        response = litellm.completion(
+            model=self.model,
+            messages=messages,
+            seed=seed,
+            num_retries=6,
+            **final_params_for_litellm,
+        )
+        return response
+
     def _split_dataset(
         self, dataset: List[Dict[str, Any]], train_ratio: float
     ) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
@@ -96,6 +112,7 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
         n_samples: int = None,
     ) -> optimization_result.OptimizationResult:
         random.seed(self.seed)
+        self.llm_call_counter = 0
 
         if not task_config.use_chat_prompt:
             raise ValueError(
@@ -161,6 +178,7 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
             project_name=self.project_name,
             experiment_config=initial_eval_config,
             optimization_id=optimization_id,
+            verbose=self.verbose,
         )
         logger.info(f"Initial (zero-shot) score: {initial_score:.4f}")
 
@@ -222,6 +240,7 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
                 project_name=self.project_name,
                 experiment_config=trial_config,
                 optimization_id=optimization_id,
+                verbose=self.verbose,
             )
             logger.debug(f"Trial {trial.number} score: {score:.4f}")
 
@@ -242,11 +261,59 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
         except Exception as e:
             logger.warning(f"Could not configure Optuna logging within optimizer: {e}")
 
+        # Explicitly create and seed the sampler for Optuna
         sampler = optuna.samplers.TPESampler(seed=self.seed)
         study = optuna.create_study(direction="maximize", sampler=sampler)
-
+
+        study.optimize(optimization_objective, n_trials=n_trials, show_progress_bar=(self.verbose >= 1))
         logger.info("Optuna study finished.")
 
+        optuna_history_processed = []
+        for trial_idx, trial in enumerate(study.trials):
+            if trial.state == optuna.trial.TrialState.COMPLETE:
+                param_obj: Optional[prompt_parameter.ChatPromptParameter] = trial.user_attrs.get("param")
+                prompt_cand_display = None  # Default to None
+                if param_obj and hasattr(param_obj, 'as_template') and callable(param_obj.as_template):
+                    try:
+                        # .format() on ChatPromptTemplate returns the list of messages
+                        chat_messages_for_history = param_obj.as_template().format()
+                        prompt_cand_display = json.dumps(chat_messages_for_history)
+                    except Exception as e_param_format:
+                        logger.warning(f"Trial {trial.number}: Error formatting prompt from param_obj: {e_param_format}")
+                        prompt_cand_display = "Error: Could not format prompt content."
+                elif not param_obj:
+                    logger.warning(f"Trial {trial.number}: 'param' object not found in user_attrs.")
+                    prompt_cand_display = "Error: Prompt data missing in trial."
+                else:
+                    logger.warning(f"Trial {trial.number}: 'param' object is not of expected type or lacks methods.")
+                    prompt_cand_display = "Error: Invalid prompt data structure in trial."
+
+                score_val = trial.value  # This can be None if trial failed to produce a score
+                duration_val = None
+                if trial.datetime_complete and trial.datetime_start:
+                    duration_val = (trial.datetime_complete - trial.datetime_start).total_seconds()
+
+                iter_detail = {
+                    "iteration": trial.number + 1,
+                    "timestamp": trial.datetime_start.isoformat() if trial.datetime_start else datetime.now().isoformat(),
+                    "prompt_candidate": prompt_cand_display,
+                    "parameters_used": {
+                        "optuna_params": trial.params,
+                        "example_indices": trial.user_attrs.get("example_indices", [])  # Default to empty list
+                    },
+                    "scores": [{
+                        "metric_name": metric_config.metric.name,
+                        "score": score_val,  # Can be None
+                        "opik_evaluation_id": None  # TODO
+                    }],
+                    "tokens_used": None,  # TODO
+                    "cost": None,  # TODO
+                    "duration_seconds": duration_val,
+                }
+                optuna_history_processed.append(iter_detail)
+            else:
+                logger.warning(f"Skipping trial {trial.number} from history due to state: {trial.state}. Value: {trial.value}")
+
         best_trial = study.best_trial
         best_score = best_trial.value
         best_n_examples = best_trial.params["n_examples"]
@@ -279,6 +346,8 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
                 "model": self.model,
                 "temperature": self.model_kwargs.get("temperature"),
             },
+            history=optuna_history_processed,
+            llm_calls=self.llm_call_counter
         )
 
     def optimize_prompt(
@@ -295,6 +364,7 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
             optimization = self._opik_client.create_optimization(
                 dataset_name=dataset.name,
                 objective_name=metric_config.metric.name,
+                metadata={"optimizer": self.__class__.__name__},
             )
         except Exception:
             logger.warning(
@@ -389,6 +459,7 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
             num_threads=self.n_threads,
             project_name=self.project_name,
             experiment_config=experiment_config,
+            verbose=self.verbose,
         )
         logger.debug(f"Evaluation score: {score:.4f}")
 
@@ -400,11 +471,11 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
         def llm_task(dataset_item: Dict[str, Any]) -> Dict[str, Any]:
            prompt_ = template.format(**dataset_item)
 
-            response = _call_model(
+            response = self._call_model(
                 model=self.model,
                 messages=prompt_,
                 seed=self.seed,
-                model_kwargs=self.model_kwargs
+                model_kwargs=self.model_kwargs
             )
 
             return {
opik_optimizer/logging_config.py
CHANGED
@@ -63,7 +63,7 @@ def setup_logging(
     _logging_configured = True
 
     # Use level name provided by rich handler by default
-    package_logger.info(f"Opik Optimizer logging configured to level: [bold cyan]{logging.getLevelName(level)}[/bold cyan]")
+    package_logger.info(f"Opik Agent Optimizer logging configured to level: [bold cyan]{logging.getLevelName(level)}[/bold cyan]")
 
 # Ensure logger obtained after setup can be used immediately if needed
 logger = logging.getLogger(__name__)
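The only change here is the log-message rename to "Opik Agent Optimizer". For reference, a hedged sketch of enabling this logging from user code; the hunk shows only the start of the `setup_logging` signature, so the positional level argument below is an assumption:

import logging

from opik_optimizer.logging_config import setup_logging  # module path from the header above

# Assumption: setup_logging accepts a standard logging level as its first argument;
# consult the package source if the parameter is named or ordered differently.
setup_logging(logging.DEBUG)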
opik_optimizer/meta_prompt_optimizer.py
CHANGED
@@ -101,6 +101,8 @@ class MetaPromptOptimizer(BaseOptimizer):
         adaptive_trial_threshold: Optional[float] = DEFAULT_ADAPTIVE_THRESHOLD,
         num_threads: int = 12,
         project_name: Optional[str] = None,
+        verbose: int = 1,
+        enable_context: bool = True,
         **model_kwargs,
     ):
         """
@@ -117,6 +119,8 @@ class MetaPromptOptimizer(BaseOptimizer):
             adaptive_trial_threshold: If not None, prompts scoring below `best_score * adaptive_trial_threshold` after initial trials won't get max trials.
             num_threads: Number of threads for parallel evaluation
             project_name: Optional project name for tracking
+            verbose: Controls internal logging/progress bars (0=off, 1=on).
+            enable_context: Whether to include task-specific context (metrics, examples) in the reasoning prompt.
             **model_kwargs: Additional model parameters
         """
         super().__init__(model=model, project_name=project_name, **model_kwargs)
@@ -128,9 +132,12 @@ class MetaPromptOptimizer(BaseOptimizer):
         self.max_trials = max_trials_per_candidate
         self.adaptive_threshold = adaptive_trial_threshold
         self.num_threads = num_threads
+        self.verbose = verbose
         self.dataset = None
         self.task_config = None
         self._opik_client = opik_client.get_client_cached()
+        self.llm_call_counter = 0
+        self.enable_context = enable_context
         logger.debug(
             f"Initialized MetaPromptOptimizer with model={model}, reasoning_model={self.reasoning_model}"
         )
@@ -151,6 +158,7 @@ class MetaPromptOptimizer(BaseOptimizer):
         experiment_config: Optional[Dict] = None,
         n_samples: Optional[int] = None,
         optimization_id: Optional[str] = None,
+        verbose: int = 1,
     ) -> float:
         """
         Evaluate a prompt using the given dataset and metric configuration.
@@ -177,6 +185,7 @@ class MetaPromptOptimizer(BaseOptimizer):
             experiment_config=experiment_config,
             n_samples=n_samples,
             optimization_id=optimization_id,
+            verbose=self.verbose,
         )
 
     @_throttle.rate_limited(_rate_limiter)
@@ -188,12 +197,21 @@ class MetaPromptOptimizer(BaseOptimizer):
         optimization_id: Optional[str] = None,
     ) -> str:
         """Call the model with the given prompt and return the response."""
+        self.llm_call_counter += 1
         # Note: Basic retry logic could be added here using tenacity
         try:
             # Basic LLM parameters (e.g., temperature, max_tokens)
+            base_temperature = getattr(self, "temperature", 0.3)
+            base_max_tokens = getattr(self, "max_tokens", 1000)
+
+            # Use potentially different settings for reasoning calls
+            reasoning_temperature = base_temperature  # Keep same temp unless specified otherwise
+            # Increase max_tokens for reasoning to ensure JSON fits, unless already high
+            reasoning_max_tokens = max(base_max_tokens, 3000) if is_reasoning else base_max_tokens
+
             llm_config_params = {
-                "temperature":
-                "max_tokens":
+                "temperature": reasoning_temperature if is_reasoning else base_temperature,
+                "max_tokens": reasoning_max_tokens,
                 "top_p": getattr(self, "top_p", 1.0),
                 "frequency_penalty": getattr(self, "frequency_penalty", 0.0),
                 "presence_penalty": getattr(self, "presence_penalty", 0.0),
@@ -242,7 +260,10 @@ class MetaPromptOptimizer(BaseOptimizer):
             )
 
             response = litellm.completion(
-                model=model_to_use,
+                model=model_to_use,
+                messages=messages,
+                num_retries=6,
+                **final_call_params
             )
             return response.choices[0].message.content
         except litellm.exceptions.RateLimitError as e:
@@ -271,6 +292,7 @@ class MetaPromptOptimizer(BaseOptimizer):
         experiment_config: Optional[Dict],
         n_samples: Optional[int],
         optimization_id: Optional[str] = None,
+        verbose: int = 1,
     ) -> float:
         # Calculate subset size for trials
         if not use_full_dataset:
@@ -429,6 +451,7 @@ class MetaPromptOptimizer(BaseOptimizer):
             n_samples=subset_size,  # Use subset_size for trials, None for full dataset
             experiment_config=experiment_config,
             optimization_id=optimization_id,
+            verbose=self.verbose,
         )
         logger.debug(f"Evaluation score: {score:.4f}")
         return score
@@ -474,7 +497,9 @@ class MetaPromptOptimizer(BaseOptimizer):
         optimization = None
         try:
             optimization = self._opik_client.create_optimization(
-                dataset_name=dataset.name,
+                dataset_name=dataset.name,
+                objective_name=metric_config.metric.name,
+                metadata={"optimizer": self.__class__.__name__},
             )
             logger.info(f"Created optimization with ID: {optimization.id}")
         except Exception as e:
@@ -519,6 +544,7 @@ class MetaPromptOptimizer(BaseOptimizer):
         self.auto_continue = auto_continue
         self.dataset = dataset
         self.task_config = task_config
+        self.llm_call_counter = 0  # Reset counter for run
 
         current_prompt = task_config.instruction_prompt
         experiment_config = experiment_config or {}
@@ -550,6 +576,7 @@ class MetaPromptOptimizer(BaseOptimizer):
             n_samples=n_samples,
             experiment_config=experiment_config,
             use_full_dataset=n_samples is None,
+            verbose=self.verbose,
         )
         best_score = initial_score
         best_prompt = current_prompt
@@ -617,6 +644,7 @@ class MetaPromptOptimizer(BaseOptimizer):
                     n_samples=n_samples,
                     use_full_dataset=False,
                     experiment_config=experiment_config,
+                    verbose=self.verbose,
                 )
                 scores.append(score)
                 logger.debug(f"Trial {trial+1} score: {score:.4f}")
@@ -659,6 +687,7 @@ class MetaPromptOptimizer(BaseOptimizer):
                     n_samples=n_samples,
                     use_full_dataset=False,
                     experiment_config=experiment_config,
+                    verbose=self.verbose,
                 )
                 scores.append(score)
                 logger.debug(
@@ -710,6 +739,7 @@ class MetaPromptOptimizer(BaseOptimizer):
                 experiment_config=experiment_config,
                 n_samples=n_samples,
                 use_full_dataset=n_samples is None,
+                verbose=self.verbose,
            )
            logger.info(
                 f"Final evaluation score for best candidate: {final_score_best_cand:.4f}"
@@ -879,6 +909,7 @@ class MetaPromptOptimizer(BaseOptimizer):
             score=best_score,
             metric_name=metric_config.metric.name,
             details=details,
+            llm_calls=self.llm_call_counter
         )
 
     def _get_task_context(self, metric_config: MetricConfig) -> str:
@@ -952,20 +983,35 @@ class MetaPromptOptimizer(BaseOptimizer):
 
         # Pass single metric_config
         history_context = self._build_history_context(previous_rounds)
-
+        task_context_str = ""
+        analysis_instruction = ""
+        metric_focus_instruction = ""
+        improvement_point_1 = ""
+
+        if self.enable_context:
+            task_context_str = self._get_task_context(metric_config=metric_config)
+            analysis_instruction = "Analyze the example provided (if any), the metric description (if any), and the history of scores."
+            metric_focus_instruction = f"Focus on improving the score for the metric: {metric_config.metric.name}."
+            improvement_point_1 = "1. Be more specific and clear about expectations based on the metric and task."
+            logger.debug("Task context and metric-specific instructions enabled for reasoning prompt.")
+        else:
+            analysis_instruction = "Analyze the history of scores and the current prompt\'s performance."
+            metric_focus_instruction = "Focus on generating diverse and effective prompt variations based on the history."
+            improvement_point_1 = "1. Be more specific and clear about expectations based on the task."
+            logger.debug("Task context and metric-specific instructions disabled for reasoning prompt.")
 
         user_prompt = f"""Current prompt: {current_prompt}
 Current score: {best_score}
 {history_context}
-{
+{task_context_str}
 
-
+{analysis_instruction}
 Generate {self.num_prompts_per_round} improved versions of this prompt.
-
+{metric_focus_instruction}
 Each version should aim to:
-
-2. Provide necessary context and constraints.
-3. Guide the model to produce the desired output format suitable for the
+{improvement_point_1}
+2. Provide necessary context and constraints (if applicable, without relying on disabled external context).
+3. Guide the model to produce the desired output format suitable for the task.
 4. Remove ambiguity and unnecessary elements.
 5. Maintain conciseness while being complete.
 
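The MetaPromptOptimizer changes mirror the few-shot ones (an `llm_call_counter`, a `verbose` flag forwarded to every evaluation) and add an `enable_context` switch that controls whether task-specific context is injected into the reasoning prompt. A hedged sketch of the new constructor flags; the import path and model id are assumptions, the other parameter names come from the hunks above:

from opik_optimizer import MetaPromptOptimizer  # top-level import path assumed

optimizer = MetaPromptOptimizer(
    model="openai/gpt-4o",  # placeholder model id
    num_threads=12,
    verbose=1,              # 0 = quiet, 1 = internal logging/progress bars
    enable_context=False,   # new: drop metric/example context from the reasoning prompt
)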
opik_optimizer/mipro_optimizer/mipro_optimizer.py
CHANGED
@@ -1,6 +1,7 @@
 from typing import Any, Dict, List, Tuple, Union, Optional, Literal
 import os
 import random
+from datetime import datetime
 
 import opik
 
@@ -37,11 +38,13 @@ logger = logging.getLogger(__name__) # Inherits config from setup_logging
 
 
 class MiproOptimizer(BaseOptimizer):
-    def __init__(self, model, project_name: Optional[str] = None, **model_kwargs):
-        super().__init__(model, project_name, **model_kwargs)
+    def __init__(self, model, project_name: Optional[str] = None, verbose: int = 1, **model_kwargs):
+        super().__init__(model, project_name, verbose=verbose, **model_kwargs)
         self.tools = []
         self.num_threads = self.model_kwargs.pop("num_threads", 6)
         self.model_kwargs["model"] = self.model
+        self.llm_call_counter = 0
+        # FIXME: add mipro_optimizer=True - It does not count the LLM calls made internally by DSPy during MiproOptimizer.optimizer.compile().
         lm = LM(**self.model_kwargs)
         opik_callback = OpikCallback(project_name=self.project_name, log_graph=True)
         dspy.configure(lm=lm, callbacks=[opik_callback])
@@ -56,6 +59,7 @@ class MiproOptimizer(BaseOptimizer):
         n_samples: int = 10,
         dataset_item_ids: Optional[List[str]] = None,
         experiment_config: Optional[Dict] = None,
+        verbose: int = 1,
         **kwargs,
     ) -> float:
         """
@@ -69,6 +73,7 @@ class MiproOptimizer(BaseOptimizer):
             n_samples: number of items to test in the dataset
             dataset_item_ids: Optional list of dataset item IDs to evaluate
             experiment_config: Optional configuration for the experiment
+            verbose: Verbosity level
             **kwargs: Additional arguments for evaluation
 
         Returns:
@@ -76,10 +81,14 @@ class MiproOptimizer(BaseOptimizer):
         """
         # FIMXE: call super when it is ready
         # FIXME: Intermediate values:
+        self.llm_call_counter += 1
         metric = metric_config.metric
         input_key = task_config.input_dataset_fields[0]  # FIXME: allow all inputs
         output_key = task_config.output_dataset_field
 
+        # Kwargs might contain n_samples, passed from run_benchmark.py
+        n_samples = kwargs.pop("n_samples", None)  # Get n_samples from kwargs if present
+
         if isinstance(dataset, str):
             opik_client = opik.Opik(project_name=self.project_name)
             dataset = opik_client.get_dataset(dataset)
@@ -144,12 +153,32 @@ class MiproOptimizer(BaseOptimizer):
 
             return result
 
-
-
-
+        # Robust n_samples handling for selecting dataset_item_ids
+        dataset_items_for_eval = dataset.get_items()
+        num_total_items = len(dataset_items_for_eval)
+        dataset_item_ids_to_use = dataset_item_ids  # Use provided IDs if any
 
-
-        dataset_item_ids
+        if n_samples is not None:  # If n_samples is specified by the caller (run_benchmark.py)
+            if dataset_item_ids is not None:
+                # This case should ideally be an error or a clear precedence rule.
+                # For now, let's assume if dataset_item_ids is provided, it takes precedence over n_samples.
+                logger.warning("MiproOptimizer.evaluate_prompt: Both n_samples and dataset_item_ids provided. Using provided dataset_item_ids.")
+                # dataset_item_ids_to_use is already dataset_item_ids
+            elif n_samples > num_total_items:
+                logger.warning(f"MiproOptimizer.evaluate_prompt: n_samples ({n_samples}) > total items ({num_total_items}). Using all {num_total_items} items.")
+                dataset_item_ids_to_use = None  # opik.evaluation.evaluate handles None as all items
+            elif n_samples <= 0:
+                logger.warning(f"MiproOptimizer.evaluate_prompt: n_samples ({n_samples}) is <= 0. Using all {num_total_items} items.")
+                dataset_item_ids_to_use = None
+            else:
+                # n_samples is valid and dataset_item_ids was not provided, so sample now.
+                all_ids = [item["id"] for item in dataset_items_for_eval]
+                dataset_item_ids_to_use = random.sample(all_ids, n_samples)
+                logger.info(f"MiproOptimizer.evaluate_prompt: Sampled {n_samples} items for evaluation.")
+        else:  # n_samples is None
+            if dataset_item_ids is None:
+                logger.info(f"MiproOptimizer.evaluate_prompt: n_samples is None and dataset_item_ids is None. Using all {num_total_items} items.")
+                # dataset_item_ids_to_use is already dataset_item_ids (which could be None)
 
         experiment_config = experiment_config or {}
         experiment_config = {
@@ -171,9 +200,10 @@ class MiproOptimizer(BaseOptimizer):
             # "reference" needs to match metric
             scoring_key_mapping={"reference": output_key},
             task_threads=self.num_threads,
-            dataset_item_ids=
+            dataset_item_ids=dataset_item_ids_to_use,
             project_name=self.project_name,
             experiment_config=experiment_config,
+            verbose=verbose,
         )
 
         # Calculate average score across all metrics
@@ -207,6 +237,7 @@ class MiproOptimizer(BaseOptimizer):
             optimization = self._opik_client.create_optimization(
                 dataset_name=dataset.name,
                 objective_name=metric_config.metric.name,
+                metadata={"optimizer": self.__class__.__name__},
             )
         except Exception:
             logger.warning(
@@ -284,13 +315,14 @@ class MiproOptimizer(BaseOptimizer):
         **kwargs,
     ) -> None:
         # FIXME: Intermediate values:
+        self.llm_call_counter = 0
         metric = metric_config.metric
         prompt = task_config.instruction_prompt
         input_key = task_config.input_dataset_fields[0]  # FIXME: allow all
         output_key = task_config.output_dataset_field
         self.tools = task_config.tools
         self.num_candidates = num_candidates
-        self.seed =
+        self.seed = 42
         self.input_key = input_key
         self.output_key = output_key
         self.prompt = prompt
@@ -347,7 +379,7 @@ class MiproOptimizer(BaseOptimizer):
             metric=self.metric_function,
             auto=self.auto,
             num_threads=self.num_threads,
-            verbose=
+            verbose=(self.verbose == 1),
             num_candidates=self.num_candidates,
             seed=self.seed,
             opik_prompt_task_config=task_config,
@@ -373,6 +405,9 @@ class MiproOptimizer(BaseOptimizer):
         """
         Continue to look for optimizations
         """
+        if not hasattr(self, 'optimizer') or not self.optimizer:
+            raise RuntimeError("MiproOptimizer not prepared. Call prepare_optimize_prompt first.")
+
         self.results = self.optimizer.compile(
             student=self.module,
             trainset=self.trainset,
@@ -385,12 +420,114 @@ class MiproOptimizer(BaseOptimizer):
             key=lambda item: item["score"],
             reverse=True,
         )
+
+        mipro_history_processed = []
+        # self.num_candidates is set in prepare_optimize_prompt, defaults to 10
+        # If self.num_candidates is 0 or None, this logic might break or be odd.
+        # Add a safeguard for num_candidates_per_round if self.num_candidates is not usable.
+        num_candidates_per_round = self.num_candidates if hasattr(self, 'num_candidates') and self.num_candidates and self.num_candidates > 0 else 1
+
+        for i, candidate_data in enumerate(self.results.candidate_programs):
+            program_module = candidate_data.get("program")
+            instruction = "N/A"
+            if hasattr(program_module, 'signature') and hasattr(program_module.signature, 'instructions'):
+                instruction = program_module.signature.instructions
+            elif hasattr(program_module, 'extended_signature') and hasattr(program_module.extended_signature, 'instructions'):
+                instruction = program_module.extended_signature.instructions
+            elif hasattr(program_module, 'predictor') and hasattr(program_module.predictor, 'signature') and hasattr(program_module.predictor.signature, 'instructions'):
+                instruction = program_module.predictor.signature.instructions
+
+            # Remove R and C calculation for Mipro as its history is flat
+            # current_round_number = (i // num_candidates_per_round) + 1
+            # current_candidate_in_round = (i % num_candidates_per_round) + 1
+
+            iter_detail = {
+                "iteration": i + 1,
+                # "round_number": current_round_number, # Remove round_number
+                # "candidate_in_round": current_candidate_in_round, # Remove candidate_in_round
+                "timestamp": datetime.now().isoformat(),
+                "prompt_candidate": instruction,
+                "parameters_used": {
+                    "program_summary": str(program_module)[:500]
+                },
+                "scores": [],  # Initialize scores list
+                "tokens_used": None,  # TODO: add tokens_used
+                "cost": None,  # TODO: add cost
+                "duration_seconds": None,  # TODO: add duration_seconds
+            }
+
+            current_score = candidate_data.get("score")
+            metric_name_for_history = self.opik_metric.name if hasattr(self, 'opik_metric') and self.opik_metric else "unknown_metric"
+
+            # Unscale if it's a known 0-1 metric that MIPRO might scale to 0-100
+            # For now, specifically targeting Levenshtein-like metrics
+            if isinstance(current_score, (float, int)) and \
+               ("levenshtein" in metric_name_for_history.lower() or "similarity" in metric_name_for_history.lower()):
+                # Assuming scores like 32.4 are 0-1 scores scaled by 100
+                if abs(current_score) > 1.0:  # A simple check to see if it looks scaled
+                    logger.debug(f"Mipro history: Unscaling score {current_score} for metric {metric_name_for_history} by dividing by 100.")
+                    current_score /= 100.0
+
+            iter_detail["scores"].append({
+                "metric_name": metric_name_for_history,
+                "score": current_score,
+                "opik_evaluation_id": None  # TODO: add opik_evaluation_id
+            })
+            mipro_history_processed.append(iter_detail)
+
+        if not self.best_programs:
+            logger.warning("MIPRO compile returned no candidate programs.")
+            return OptimizationResult(
+                optimizer="MiproOptimizer",
+                prompt=self.prompt,
+                score=0.0,
+                metric_name=self.opik_metric.name if hasattr(self, 'opik_metric') else "unknown_metric",
+                details={"error": "No candidate programs generated by MIPRO"},
+                history=mipro_history_processed,
+                llm_calls=self.llm_call_counter
+            )
+
         self.module = self.get_best().details["program"]
-
+        best_program_details = self.get_best()
+
+        # Unscale the main score if necessary, similar to history scores
+        final_best_score = best_program_details.score
+        final_metric_name = best_program_details.metric_name
+        if isinstance(final_best_score, (float, int)) and \
+           final_metric_name and \
+           ("levenshtein" in final_metric_name.lower() or "similarity" in final_metric_name.lower()):
+            if abs(final_best_score) > 1.0:  # A simple check to see if it looks scaled
+                logger.debug(f"Mipro main result: Unscaling score {final_best_score} for metric {final_metric_name} by dividing by 100.")
+                final_best_score /= 100.0
+
+        return OptimizationResult(
+            optimizer="MiproOptimizer",
+            prompt=best_program_details.prompt,
+            tool_prompts=best_program_details.tool_prompts,
+            score=final_best_score,  # Use the potentially unscaled score
+            metric_name=final_metric_name,
+            demonstrations=best_program_details.demonstrations,
+            details=best_program_details.details,
+            history=mipro_history_processed,
+            llm_calls=self.llm_call_counter
+        )
 
     def get_best(self, position: int = 0) -> OptimizationResult:
+        if not hasattr(self, 'best_programs') or not self.best_programs:
+            logger.error("get_best() called but no best_programs found. MIPRO compile might have failed or yielded no results.")
+            return OptimizationResult(
+                optimizer="MiproOptimizer",
+                prompt=getattr(self, 'prompt', "Error: Initial prompt not found"),
+                score=0.0,
+                metric_name=getattr(self, 'opik_metric', None).name if hasattr(self, 'opik_metric') and self.opik_metric else "unknown_metric",
+                details={"error": "No programs generated or compile failed"},
+                history=[],
+                llm_calls=self.llm_call_counter
+            )
+
         score = self.best_programs[position]["score"]
-
+        program_module = self.best_programs[position]["program"]
+        state = program_module.dump_state()
         if self.tools:
             tool_names = [tool.__name__ for tool in self.tools]
             tool_prompts = get_tool_prompts(
@@ -410,5 +547,6 @@ class MiproOptimizer(BaseOptimizer):
             score=score,
             metric_name=self.opik_metric.name,
             demonstrations=demos,
-            details={"program":
+            details={"program": program_module},
+            llm_calls=self.llm_call_counter
         )