opik-optimizer 0.7.8__py3-none-any.whl → 0.8.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opik_optimizer/__init__.py +2 -0
- opik_optimizer/base_optimizer.py +6 -4
- opik_optimizer/data/hotpot-500.json +501 -1001
- opik_optimizer/datasets/__init__.py +27 -0
- opik_optimizer/datasets/ai2_arc.py +44 -0
- opik_optimizer/datasets/cnn_dailymail.py +40 -0
- opik_optimizer/datasets/election_questions.py +36 -0
- opik_optimizer/datasets/gsm8k.py +40 -0
- opik_optimizer/datasets/halu_eval.py +43 -0
- opik_optimizer/datasets/hotpot_qa.py +68 -0
- opik_optimizer/datasets/medhallu.py +39 -0
- opik_optimizer/datasets/rag_hallucinations.py +41 -0
- opik_optimizer/datasets/ragbench.py +40 -0
- opik_optimizer/datasets/tiny_test.py +57 -0
- opik_optimizer/datasets/truthful_qa.py +107 -0
- opik_optimizer/demo/datasets.py +53 -607
- opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +3 -1
- opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +90 -19
- opik_optimizer/logging_config.py +1 -1
- opik_optimizer/meta_prompt_optimizer.py +60 -14
- opik_optimizer/mipro_optimizer/mipro_optimizer.py +151 -13
- opik_optimizer/optimization_result.py +11 -0
- opik_optimizer/task_evaluator.py +6 -1
- opik_optimizer/utils.py +0 -52
- opik_optimizer-0.8.1.dist-info/METADATA +196 -0
- opik_optimizer-0.8.1.dist-info/RECORD +45 -0
- opik_optimizer-0.7.8.dist-info/METADATA +0 -174
- opik_optimizer-0.7.8.dist-info/RECORD +0 -33
- {opik_optimizer-0.7.8.dist-info → opik_optimizer-0.8.1.dist-info}/WHEEL +0 -0
- {opik_optimizer-0.7.8.dist-info → opik_optimizer-0.8.1.dist-info}/licenses/LICENSE +0 -0
- {opik_optimizer-0.7.8.dist-info → opik_optimizer-0.8.1.dist-info}/top_level.txt +0 -0
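For orientation, here is a minimal usage sketch of the API changes visible in this diff: the new `verbose` constructor flag on the optimizers and the `llm_calls` / `history` fields added to the optimization result. The model name and the commented-out `optimize_prompt` arguments are placeholders rather than values taken from this diff, and the released API may differ.

    # Illustrative sketch only; grounded in the signatures shown in this diff,
    # but the model name and optimize_prompt arguments are placeholders.
    from opik_optimizer import FewShotBayesianOptimizer

    optimizer = FewShotBayesianOptimizer(
        model="openai/gpt-4o-mini",   # placeholder LiteLLM model name
        project_name="my-project",
        verbose=1,                    # new in 0.8.1: toggles progress bars / extra logging
    )

    # result = optimizer.optimize_prompt(dataset, metric_config, task_config, n_trials=10)
    # print(result.llm_calls)  # new in 0.8.1: number of LLM calls made during optimization
    # print(result.history)    # new in 0.8.1: per-trial history collected from Optuna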
opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py
CHANGED
@@ -5,6 +5,7 @@ import optuna
 import optuna.samplers
 import logging
 import json
+from datetime import datetime

 from opik import Dataset
 from opik_optimizer.optimization_config import mappers
@@ -26,20 +27,6 @@ _limiter = _throttle.get_rate_limiter_for_current_opik_installation()
 logger = logging.getLogger(__name__)


-@_throttle.rate_limited(_limiter)
-def _call_model(model, messages, seed, model_kwargs):
-    model_kwargs = opik_litellm_monitor.try_add_opik_monitoring_to_params(model_kwargs)
-
-    response = litellm.completion(
-        model=model,
-        messages=messages,
-        seed=seed,
-        **model_kwargs,
-    )
-
-    return response
-
-
 class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
     def __init__(
         self,
@@ -51,6 +38,7 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
         n_threads: int = 8,
         n_initial_prompts: int = 5,
         n_iterations: int = 10,
+        verbose: int = 1,
         **model_kwargs,
     ) -> None:
         super().__init__(model, project_name, **model_kwargs)
@@ -60,9 +48,37 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
         self.n_threads = n_threads
         self.n_initial_prompts = n_initial_prompts
         self.n_iterations = n_iterations
+        self.verbose = verbose
         self._opik_client = opik.Opik()
+        self.llm_call_counter = 0
         logger.debug(f"Initialized FewShotBayesianOptimizer with model: {model}")

+    @_throttle.rate_limited(_limiter)
+    def _call_model(self, model, messages, seed, model_kwargs):
+        self.llm_call_counter += 1
+
+        current_model_kwargs = self.model_kwargs.copy()
+        current_model_kwargs.update(model_kwargs)
+
+        filtered_call_kwargs = current_model_kwargs.copy()
+        filtered_call_kwargs.pop('n_trials', None)
+        filtered_call_kwargs.pop('n_samples', None)
+        filtered_call_kwargs.pop('n_iterations', None)
+        filtered_call_kwargs.pop('min_examples', None)
+        filtered_call_kwargs.pop('max_examples', None)
+        filtered_call_kwargs.pop('n_initial_prompts', None)
+
+        final_params_for_litellm = opik_litellm_monitor.try_add_opik_monitoring_to_params(filtered_call_kwargs)
+
+        response = litellm.completion(
+            model=self.model,
+            messages=messages,
+            seed=seed,
+            num_retries=6,
+            **final_params_for_litellm,
+        )
+        return response
+
     def _split_dataset(
         self, dataset: List[Dict[str, Any]], train_ratio: float
     ) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
@@ -96,6 +112,7 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
         n_samples: int = None,
     ) -> optimization_result.OptimizationResult:
         random.seed(self.seed)
+        self.llm_call_counter = 0

         if not task_config.use_chat_prompt:
             raise ValueError(
@@ -161,6 +178,7 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
             project_name=self.project_name,
             experiment_config=initial_eval_config,
             optimization_id=optimization_id,
+            verbose=self.verbose,
         )
         logger.info(f"Initial (zero-shot) score: {initial_score:.4f}")

@@ -222,6 +240,7 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
                 project_name=self.project_name,
                 experiment_config=trial_config,
                 optimization_id=optimization_id,
+                verbose=self.verbose,
             )
             logger.debug(f"Trial {trial.number} score: {score:.4f}")

@@ -242,11 +261,59 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
         except Exception as e:
             logger.warning(f"Could not configure Optuna logging within optimizer: {e}")

+        # Explicitly create and seed the sampler for Optuna
         sampler = optuna.samplers.TPESampler(seed=self.seed)
         study = optuna.create_study(direction="maximize", sampler=sampler)
-
+
+        study.optimize(optimization_objective, n_trials=n_trials, show_progress_bar=(self.verbose >= 1))
         logger.info("Optuna study finished.")

+        optuna_history_processed = []
+        for trial_idx, trial in enumerate(study.trials):
+            if trial.state == optuna.trial.TrialState.COMPLETE:
+                param_obj: Optional[prompt_parameter.ChatPromptParameter] = trial.user_attrs.get("param")
+                prompt_cand_display = None  # Default to None
+                if param_obj and hasattr(param_obj, 'as_template') and callable(param_obj.as_template):
+                    try:
+                        # .format() on ChatPromptTemplate returns the list of messages
+                        chat_messages_for_history = param_obj.as_template().format()
+                        prompt_cand_display = json.dumps(chat_messages_for_history)
+                    except Exception as e_param_format:
+                        logger.warning(f"Trial {trial.number}: Error formatting prompt from param_obj: {e_param_format}")
+                        prompt_cand_display = "Error: Could not format prompt content."
+                elif not param_obj:
+                    logger.warning(f"Trial {trial.number}: 'param' object not found in user_attrs.")
+                    prompt_cand_display = "Error: Prompt data missing in trial."
+                else:
+                    logger.warning(f"Trial {trial.number}: 'param' object is not of expected type or lacks methods.")
+                    prompt_cand_display = "Error: Invalid prompt data structure in trial."
+
+                score_val = trial.value  # This can be None if trial failed to produce a score
+                duration_val = None
+                if trial.datetime_complete and trial.datetime_start:
+                    duration_val = (trial.datetime_complete - trial.datetime_start).total_seconds()
+
+                iter_detail = {
+                    "iteration": trial.number + 1,
+                    "timestamp": trial.datetime_start.isoformat() if trial.datetime_start else datetime.now().isoformat(),
+                    "prompt_candidate": prompt_cand_display,
+                    "parameters_used": {
+                        "optuna_params": trial.params,
+                        "example_indices": trial.user_attrs.get("example_indices", [])  # Default to empty list
+                    },
+                    "scores": [{
+                        "metric_name": metric_config.metric.name,
+                        "score": score_val,  # Can be None
+                        "opik_evaluation_id": None  # TODO
+                    }],
+                    "tokens_used": None,  # TODO
+                    "cost": None,  # TODO
+                    "duration_seconds": duration_val,
+                }
+                optuna_history_processed.append(iter_detail)
+            else:
+                logger.warning(f"Skipping trial {trial.number} from history due to state: {trial.state}. Value: {trial.value}")
+
         best_trial = study.best_trial
         best_score = best_trial.value
         best_n_examples = best_trial.params["n_examples"]
@@ -274,11 +341,13 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
                 "total_trials": n_trials,
                 "rounds": [],
                 "stopped_early": False,
-                "metric_config": metric_config.
-                "task_config": task_config.
+                "metric_config": metric_config.model_dump(),
+                "task_config": task_config.model_dump(),
                 "model": self.model,
                 "temperature": self.model_kwargs.get("temperature"),
             },
+            history=optuna_history_processed,
+            llm_calls=self.llm_call_counter
         )

     def optimize_prompt(
@@ -295,6 +364,7 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
             optimization = self._opik_client.create_optimization(
                 dataset_name=dataset.name,
                 objective_name=metric_config.metric.name,
+                metadata={"optimizer": self.__class__.__name__},
             )
         except Exception:
             logger.warning(
@@ -389,6 +459,7 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
             num_threads=self.n_threads,
             project_name=self.project_name,
             experiment_config=experiment_config,
+            verbose=self.verbose,
         )
         logger.debug(f"Evaluation score: {score:.4f}")

@@ -400,11 +471,11 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
         def llm_task(dataset_item: Dict[str, Any]) -> Dict[str, Any]:
             prompt_ = template.format(**dataset_item)

-            response = _call_model(
+            response = self._call_model(
                 model=self.model,
                 messages=prompt_,
                 seed=self.seed,
-                model_kwargs=self.model_kwargs
+                model_kwargs=self.model_kwargs
             )

             return {
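The refactored `_call_model` above merges instance and per-call kwargs, then strips optimizer-control keys before forwarding the rest to `litellm.completion`. A standalone sketch of that filtering pattern (the key set is copied from the diff; the helper name is invented for illustration):

    # Hypothetical helper illustrating the kwarg-filtering pattern used by the new
    # FewShotBayesianOptimizer._call_model: optimizer-control keys never reach litellm.
    OPTIMIZER_ONLY_KEYS = {
        "n_trials", "n_samples", "n_iterations",
        "min_examples", "max_examples", "n_initial_prompts",
    }

    def filter_llm_kwargs(model_kwargs: dict) -> dict:
        """Return a copy of model_kwargs with optimizer-control keys removed."""
        return {k: v for k, v in model_kwargs.items() if k not in OPTIMIZER_ONLY_KEYS}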
opik_optimizer/logging_config.py
CHANGED
@@ -63,7 +63,7 @@ def setup_logging(
     _logging_configured = True

     # Use level name provided by rich handler by default
-    package_logger.info(f"Opik Optimizer logging configured to level: [bold cyan]{logging.getLevelName(level)}[/bold cyan]")
+    package_logger.info(f"Opik Agent Optimizer logging configured to level: [bold cyan]{logging.getLevelName(level)}[/bold cyan]")

     # Ensure logger obtained after setup can be used immediately if needed
     logger = logging.getLogger(__name__)
opik_optimizer/meta_prompt_optimizer.py
CHANGED
@@ -101,6 +101,8 @@ class MetaPromptOptimizer(BaseOptimizer):
         adaptive_trial_threshold: Optional[float] = DEFAULT_ADAPTIVE_THRESHOLD,
         num_threads: int = 12,
         project_name: Optional[str] = None,
+        verbose: int = 1,
+        enable_context: bool = True,
         **model_kwargs,
     ):
         """
@@ -117,6 +119,8 @@ class MetaPromptOptimizer(BaseOptimizer):
             adaptive_trial_threshold: If not None, prompts scoring below `best_score * adaptive_trial_threshold` after initial trials won't get max trials.
             num_threads: Number of threads for parallel evaluation
             project_name: Optional project name for tracking
+            verbose: Controls internal logging/progress bars (0=off, 1=on).
+            enable_context: Whether to include task-specific context (metrics, examples) in the reasoning prompt.
             **model_kwargs: Additional model parameters
         """
         super().__init__(model=model, project_name=project_name, **model_kwargs)
@@ -128,9 +132,12 @@ class MetaPromptOptimizer(BaseOptimizer):
         self.max_trials = max_trials_per_candidate
         self.adaptive_threshold = adaptive_trial_threshold
         self.num_threads = num_threads
+        self.verbose = verbose
         self.dataset = None
         self.task_config = None
         self._opik_client = opik_client.get_client_cached()
+        self.llm_call_counter = 0
+        self.enable_context = enable_context
         logger.debug(
             f"Initialized MetaPromptOptimizer with model={model}, reasoning_model={self.reasoning_model}"
         )
@@ -151,6 +158,7 @@ class MetaPromptOptimizer(BaseOptimizer):
         experiment_config: Optional[Dict] = None,
         n_samples: Optional[int] = None,
         optimization_id: Optional[str] = None,
+        verbose: int = 1,
     ) -> float:
         """
         Evaluate a prompt using the given dataset and metric configuration.
@@ -177,6 +185,7 @@ class MetaPromptOptimizer(BaseOptimizer):
             experiment_config=experiment_config,
             n_samples=n_samples,
             optimization_id=optimization_id,
+            verbose=self.verbose,
         )

     @_throttle.rate_limited(_rate_limiter)
@@ -188,12 +197,21 @@ class MetaPromptOptimizer(BaseOptimizer):
         optimization_id: Optional[str] = None,
     ) -> str:
         """Call the model with the given prompt and return the response."""
+        self.llm_call_counter += 1
         # Note: Basic retry logic could be added here using tenacity
         try:
             # Basic LLM parameters (e.g., temperature, max_tokens)
+            base_temperature = getattr(self, "temperature", 0.3)
+            base_max_tokens = getattr(self, "max_tokens", 1000)
+
+            # Use potentially different settings for reasoning calls
+            reasoning_temperature = base_temperature  # Keep same temp unless specified otherwise
+            # Increase max_tokens for reasoning to ensure JSON fits, unless already high
+            reasoning_max_tokens = max(base_max_tokens, 3000) if is_reasoning else base_max_tokens
+
             llm_config_params = {
-                "temperature":
-                "max_tokens":
+                "temperature": reasoning_temperature if is_reasoning else base_temperature,
+                "max_tokens": reasoning_max_tokens,
                 "top_p": getattr(self, "top_p", 1.0),
                 "frequency_penalty": getattr(self, "frequency_penalty", 0.0),
                 "presence_penalty": getattr(self, "presence_penalty", 0.0),
@@ -242,7 +260,10 @@ class MetaPromptOptimizer(BaseOptimizer):
             )

             response = litellm.completion(
-                model=model_to_use,
+                model=model_to_use,
+                messages=messages,
+                num_retries=6,
+                **final_call_params
             )
             return response.choices[0].message.content
         except litellm.exceptions.RateLimitError as e:
@@ -271,6 +292,7 @@ class MetaPromptOptimizer(BaseOptimizer):
         experiment_config: Optional[Dict],
         n_samples: Optional[int],
         optimization_id: Optional[str] = None,
+        verbose: int = 1,
     ) -> float:
         # Calculate subset size for trials
         if not use_full_dataset:
@@ -429,6 +451,7 @@ class MetaPromptOptimizer(BaseOptimizer):
             n_samples=subset_size,  # Use subset_size for trials, None for full dataset
             experiment_config=experiment_config,
             optimization_id=optimization_id,
+            verbose=self.verbose,
         )
         logger.debug(f"Evaluation score: {score:.4f}")
         return score
@@ -474,7 +497,9 @@ class MetaPromptOptimizer(BaseOptimizer):
         optimization = None
         try:
             optimization = self._opik_client.create_optimization(
-                dataset_name=dataset.name,
+                dataset_name=dataset.name,
+                objective_name=metric_config.metric.name,
+                metadata={"optimizer": self.__class__.__name__},
             )
             logger.info(f"Created optimization with ID: {optimization.id}")
         except Exception as e:
@@ -519,6 +544,7 @@ class MetaPromptOptimizer(BaseOptimizer):
         self.auto_continue = auto_continue
         self.dataset = dataset
         self.task_config = task_config
+        self.llm_call_counter = 0  # Reset counter for run

         current_prompt = task_config.instruction_prompt
         experiment_config = experiment_config or {}
@@ -550,6 +576,7 @@ class MetaPromptOptimizer(BaseOptimizer):
             n_samples=n_samples,
             experiment_config=experiment_config,
             use_full_dataset=n_samples is None,
+            verbose=self.verbose,
         )
         best_score = initial_score
         best_prompt = current_prompt
@@ -617,6 +644,7 @@ class MetaPromptOptimizer(BaseOptimizer):
                     n_samples=n_samples,
                     use_full_dataset=False,
                     experiment_config=experiment_config,
+                    verbose=self.verbose,
                 )
                 scores.append(score)
                 logger.debug(f"Trial {trial+1} score: {score:.4f}")
@@ -659,6 +687,7 @@ class MetaPromptOptimizer(BaseOptimizer):
                     n_samples=n_samples,
                     use_full_dataset=False,
                     experiment_config=experiment_config,
+                    verbose=self.verbose,
                 )
                 scores.append(score)
                 logger.debug(
@@ -710,6 +739,7 @@ class MetaPromptOptimizer(BaseOptimizer):
             experiment_config=experiment_config,
             n_samples=n_samples,
             use_full_dataset=n_samples is None,
+            verbose=self.verbose,
         )
         logger.info(
             f"Final evaluation score for best candidate: {final_score_best_cand:.4f}"
@@ -749,7 +779,7 @@ class MetaPromptOptimizer(BaseOptimizer):
                 improvement,
             )
             rounds.append(round_data)
-            self._add_to_history(round_data.
+            self._add_to_history(round_data.model_dump())

             if (
                 improvement < self.improvement_threshold and round_num > 0
@@ -867,8 +897,8 @@ class MetaPromptOptimizer(BaseOptimizer):
             "rounds": rounds,
             "total_rounds": len(rounds),
             "stopped_early": stopped_early,
-            "metric_config": metric_config.
-            "task_config": task_config.
+            "metric_config": metric_config.model_dump(),
+            "task_config": task_config.model_dump(),
             "model": self.model,
             "temperature": self.model_kwargs.get("temperature"),
         }
@@ -879,6 +909,7 @@ class MetaPromptOptimizer(BaseOptimizer):
             score=best_score,
             metric_name=metric_config.metric.name,
             details=details,
+            llm_calls=self.llm_call_counter
         )

     def _get_task_context(self, metric_config: MetricConfig) -> str:
@@ -952,20 +983,35 @@ class MetaPromptOptimizer(BaseOptimizer):

         # Pass single metric_config
         history_context = self._build_history_context(previous_rounds)
-
+        task_context_str = ""
+        analysis_instruction = ""
+        metric_focus_instruction = ""
+        improvement_point_1 = ""
+
+        if self.enable_context:
+            task_context_str = self._get_task_context(metric_config=metric_config)
+            analysis_instruction = "Analyze the example provided (if any), the metric description (if any), and the history of scores."
+            metric_focus_instruction = f"Focus on improving the score for the metric: {metric_config.metric.name}."
+            improvement_point_1 = "1. Be more specific and clear about expectations based on the metric and task."
+            logger.debug("Task context and metric-specific instructions enabled for reasoning prompt.")
+        else:
+            analysis_instruction = "Analyze the history of scores and the current prompt\'s performance."
+            metric_focus_instruction = "Focus on generating diverse and effective prompt variations based on the history."
+            improvement_point_1 = "1. Be more specific and clear about expectations based on the task."
+            logger.debug("Task context and metric-specific instructions disabled for reasoning prompt.")

         user_prompt = f"""Current prompt: {current_prompt}
 Current score: {best_score}
 {history_context}
-{
+{task_context_str}

-
+{analysis_instruction}
 Generate {self.num_prompts_per_round} improved versions of this prompt.
-
+{metric_focus_instruction}
 Each version should aim to:
-
-2. Provide necessary context and constraints.
-3. Guide the model to produce the desired output format suitable for the
+{improvement_point_1}
+2. Provide necessary context and constraints (if applicable, without relying on disabled external context).
+3. Guide the model to produce the desired output format suitable for the task.
 4. Remove ambiguity and unnecessary elements.
 5. Maintain conciseness while being complete.

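The MetaPromptOptimizer changes above add two constructor flags, `verbose` and `enable_context`, plus an `llm_calls` counter on the result. A minimal construction sketch, assuming the class is exported from the package root as in earlier releases (the model name is a placeholder):

    # Illustrative sketch of the new MetaPromptOptimizer flags shown in this diff.
    from opik_optimizer import MetaPromptOptimizer

    optimizer = MetaPromptOptimizer(
        model="openai/gpt-4o-mini",  # placeholder model name
        verbose=0,                   # new in 0.8.1: silence progress bars and extra logging
        enable_context=False,        # new in 0.8.1: omit task context from the reasoning prompt
    )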