opik-optimizer 2.1.3-py3-none-any.whl → 2.2.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opik_optimizer/__init__.py +0 -2
- opik_optimizer/base_optimizer.py +314 -145
- opik_optimizer/evolutionary_optimizer/crossover_ops.py +31 -4
- opik_optimizer/evolutionary_optimizer/evaluation_ops.py +23 -3
- opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +122 -95
- opik_optimizer/evolutionary_optimizer/mcp.py +11 -6
- opik_optimizer/evolutionary_optimizer/mutation_ops.py +25 -5
- opik_optimizer/evolutionary_optimizer/population_ops.py +26 -10
- opik_optimizer/evolutionary_optimizer/reporting.py +5 -5
- opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +53 -99
- opik_optimizer/few_shot_bayesian_optimizer/reporting.py +4 -4
- opik_optimizer/gepa_optimizer/gepa_optimizer.py +183 -172
- opik_optimizer/gepa_optimizer/reporting.py +164 -22
- opik_optimizer/hierarchical_reflective_optimizer/hierarchical_reflective_optimizer.py +90 -167
- opik_optimizer/hierarchical_reflective_optimizer/prompts.py +7 -1
- opik_optimizer/hierarchical_reflective_optimizer/reporting.py +168 -75
- opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +185 -205
- opik_optimizer/meta_prompt_optimizer/reporting.py +4 -4
- opik_optimizer/mipro_optimizer/__init__.py +2 -2
- opik_optimizer/mipro_optimizer/_lm.py +4 -4
- opik_optimizer/mipro_optimizer/{_mipro_optimizer_v2.py → mipro_optimizer_v2.py} +1 -7
- opik_optimizer/mipro_optimizer/utils.py +1 -0
- opik_optimizer/optimizable_agent.py +7 -4
- opik_optimizer/optimization_config/chat_prompt.py +7 -10
- opik_optimizer/parameter_optimizer/parameter_optimizer.py +188 -40
- opik_optimizer/parameter_optimizer/reporting.py +148 -0
- opik_optimizer/reporting_utils.py +42 -15
- opik_optimizer/utils/core.py +16 -2
- opik_optimizer/utils/prompt_segments.py +1 -2
- {opik_optimizer-2.1.3.dist-info → opik_optimizer-2.2.0.dist-info}/METADATA +2 -3
- {opik_optimizer-2.1.3.dist-info → opik_optimizer-2.2.0.dist-info}/RECORD +34 -35
- opik_optimizer/evolutionary_optimizer/llm_support.py +0 -136
- opik_optimizer/mipro_optimizer/mipro_optimizer.py +0 -680
- {opik_optimizer-2.1.3.dist-info → opik_optimizer-2.2.0.dist-info}/WHEEL +0 -0
- {opik_optimizer-2.1.3.dist-info → opik_optimizer-2.2.0.dist-info}/licenses/LICENSE +0 -0
- {opik_optimizer-2.1.3.dist-info → opik_optimizer-2.2.0.dist-info}/top_level.txt +0 -0
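The most visible API change in this release is in MetaPromptOptimizer (see the meta_prompt_optimizer.py hunks below): the constructor now takes model_parameters, prompts_per_round and n_threads instead of **model_kwargs, num_prompts_per_round and num_threads, and optimize_prompt() gains explicit project_name, max_trials, mcp_config, candidate_generator and candidate_generator_kwargs parameters. A minimal 2.2.0-style sketch inferred from those hunks; the dataset name, metric, and prompt text are placeholders (not part of this diff), and the ChatPrompt fields mirror the usage example added to the new docstring:

```python
from opik import Opik
from opik_optimizer import ChatPrompt, MetaPromptOptimizer

client = Opik()
dataset = client.get_dataset("my_dataset")  # placeholder dataset name

# ChatPrompt fields follow the usage example in the 2.2.0 docstring shown below.
prompt = ChatPrompt(
    system="You are a helpful assistant.",
    user_template="Answer this question: {question}",
)

def accuracy_metric(dataset_item, llm_output):
    # placeholder metric: 1.0 when the output matches the reference answer
    return 1.0 if llm_output == dataset_item["expected"] else 0.0

optimizer = MetaPromptOptimizer(
    model="gpt-4o",                         # now has a default of "gpt-4o"
    model_parameters={"temperature": 0.3},  # replaces the old **model_kwargs
    prompts_per_round=4,                    # replaces num_prompts_per_round
    n_threads=12,                           # replaces the deprecated num_threads
)
result = optimizer.optimize_prompt(
    prompt=prompt,
    dataset=dataset,
    metric=accuracy_metric,
    project_name="Optimization",  # new explicit parameter in 2.2.0
    max_trials=10,                # new explicit parameter in 2.2.0
)
print(result.best_score, result.best_prompt)
```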
--- a/opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py
+++ b/opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py
@@ -3,7 +3,6 @@ import json
 import logging
 import os
 import textwrap
-import warnings
 from typing import Any, cast
 from collections.abc import Callable

@@ -11,9 +10,8 @@ import litellm
 import opik
 from litellm.caching import Cache
 from litellm.types.caching import LiteLLMCacheType
-from opik import Dataset
+from opik import Dataset, opik_context
 from opik.environment import get_tqdm_for_current_environment
-from opik.evaluation.models.litellm import opik_monitor as opik_litellm_monitor

 from opik_optimizer import task_evaluator

@@ -89,10 +87,32 @@ def _sync_tool_description_in_system(prompt: chat_prompt.ChatPrompt) -> None:

 class MetaPromptOptimizer(BaseOptimizer):
     """
-
-
-    This
-
+    Meta-Prompt Optimizer that uses LLM-based meta-reasoning to iteratively improve prompts.
+
+    This optimizer uses an LLM to analyze prompt performance and generate improved variations
+    by reasoning about what changes would be most effective. It's particularly useful for:
+    - Ensuring prompts follow best practices
+    - Refining prompts for clarity and effectiveness
+    - Optimizing prompts for specific evaluation metrics
+    - Improving prompts based on performance feedback
+
+    The optimizer works by:
+    1. Evaluating the current prompt on a dataset
+    2. Using an LLM to reason about improvements based on performance
+    3. Generating candidate prompt variations
+    4. Evaluating candidates and selecting the best
+    5. Repeating until max_trials is reached or performance plateaus
+
+    Args:
+        model: LiteLLM model name for optimizer's internal reasoning/generation calls
+        model_parameters: Optional dict of LiteLLM parameters for optimizer's internal LLM calls.
+            Common params: temperature, max_tokens, max_completion_tokens, top_p.
+            See: https://docs.litellm.ai/docs/completion/input
+        prompts_per_round: Number of candidate prompts to generate per optimization round
+        enable_context: Whether to include task-specific context when reasoning about improvements
+        n_threads: Number of parallel threads for prompt evaluation
+        verbose: Controls internal logging/progress bars (0=off, 1=on)
+        seed: Random seed for reproducibility
     """

     # --- Constants for Default Configuration ---

@@ -134,160 +154,30 @@ class MetaPromptOptimizer(BaseOptimizer):

     def __init__(
         self,
-        model: str,
-
-
-        num_prompts_per_round: int = DEFAULT_PROMPTS_PER_ROUND,
-        num_threads: int | None = None,
-        verbose: int = 1,
+        model: str = "gpt-4o",
+        model_parameters: dict[str, Any] | None = None,
+        prompts_per_round: int = DEFAULT_PROMPTS_PER_ROUND,
         enable_context: bool = True,
         n_threads: int = 12,
+        verbose: int = 1,
         seed: int = 42,
-        **model_kwargs: Any,
     ) -> None:
-
-
-
-
-
-        num_prompts_per_round: Number of prompts to generate per round
-        n_threads: Number of threads for parallel evaluation
-        verbose: Controls internal logging/progress bars (0=off, 1=on).
-        enable_context: Whether to include task-specific context (metrics, examples) in the reasoning prompt.
-        **model_kwargs: Additional model parameters
-        """
-        if "project_name" in model_kwargs:
-            warnings.warn(
-                "The 'project_name' parameter in optimizer constructor is deprecated. "
-                "Set project_name in the ChatPrompt instead.",
-                DeprecationWarning,
-                stacklevel=2,
-            )
-            del model_kwargs["project_name"]
-
-        super().__init__(model=model, verbose=verbose, seed=seed, **model_kwargs)
-        self.reasoning_model = reasoning_model if reasoning_model is not None else model
-        self.rounds = rounds
-        self.num_prompts_per_round = num_prompts_per_round
-        if num_threads is not None:
-            warnings.warn(
-                "The 'num_threads' parameter is deprecated and will be removed in a future version. "
-                "Use 'n_threads' instead.",
-                DeprecationWarning,
-                stacklevel=2,
-            )
-            n_threads = num_threads
-        self.num_threads = n_threads
+        super().__init__(
+            model=model, verbose=verbose, seed=seed, model_parameters=model_parameters
+        )
+        self.prompts_per_round = prompts_per_round
+        self.n_threads = n_threads
         self.dataset: Dataset | None = None
         self.enable_context = enable_context
-        logger.debug(
-
-        )
-        logger.debug(
-            f"Optimization rounds: {rounds}, Prompts/round: {num_prompts_per_round}"
-        )
+        logger.debug(f"Initialized MetaPromptOptimizer with model={model}")
+        logger.debug(f"Prompts/round: {prompts_per_round}")

     def get_optimizer_metadata(self) -> dict[str, Any]:
         return {
-            "
-            "num_prompts_per_round": self.num_prompts_per_round,
-            "reasoning_model": self.reasoning_model,
+            "prompts_per_round": self.prompts_per_round,
             "enable_context": self.enable_context,
         }

-    @_throttle.rate_limited(_rate_limiter)
-    def _call_model(
-        self,
-        project_name: str,
-        messages: list[dict[str, str]],
-        is_reasoning: bool = False,
-        optimization_id: str | None = None,
-    ) -> str:
-        """Call the model with the given prompt and return the response."""
-        self.increment_llm_counter()
-        # Note: Basic retry logic could be added here using tenacity
-        try:
-            # Basic LLM parameters (e.g., temperature, max_tokens)
-            base_temperature = getattr(self, "temperature", 0.3)
-            base_max_tokens = getattr(self, "max_tokens", 1000)
-
-            # Use potentially different settings for reasoning calls
-            reasoning_temperature = (
-                base_temperature  # Keep same temp unless specified otherwise
-            )
-            # Increase max_tokens for reasoning to ensure JSON fits, unless already high
-            reasoning_max_tokens = (
-                max(base_max_tokens, 3000) if is_reasoning else base_max_tokens
-            )
-
-            llm_config_params = {
-                "temperature": (
-                    reasoning_temperature if is_reasoning else base_temperature
-                ),
-                "max_tokens": reasoning_max_tokens,
-                "top_p": getattr(self, "top_p", 1.0),
-                "frequency_penalty": getattr(self, "frequency_penalty", 0.0),
-                "presence_penalty": getattr(self, "presence_penalty", 0.0),
-            }
-
-            # Prepare metadata that we want to be part of the LLM call context.
-            metadata_for_opik: dict[str, Any] = {}
-            if project_name:
-                metadata_for_opik["project_name"] = (
-                    project_name  # Top-level for general use
-                )
-                metadata_for_opik["opik"] = {"project_name": project_name}
-
-            if optimization_id:
-                # Also add to opik-specific structure if project_name was added
-                if "opik" in metadata_for_opik:
-                    metadata_for_opik["opik"]["optimization_id"] = optimization_id
-
-            metadata_for_opik["optimizer_name"] = self.__class__.__name__
-            metadata_for_opik["opik_call_type"] = (
-                "reasoning" if is_reasoning else "evaluation_llm_task_direct"
-            )
-
-            if metadata_for_opik:
-                llm_config_params["metadata"] = metadata_for_opik
-
-            model_to_use = self.reasoning_model if is_reasoning else self.model
-
-            # Pass llm_config_params (which now includes our metadata) to the Opik monitor.
-            # The monitor is expected to return a dictionary suitable for spreading into litellm.completion,
-            # having handled our metadata and added any Opik-specific configurations.
-            final_call_params = opik_litellm_monitor.try_add_opik_monitoring_to_params(
-                llm_config_params.copy()
-            )
-
-            logger.debug(
-                f"Calling model '{model_to_use}' with messages: {messages}, "
-                f"final params for litellm (from monitor): {final_call_params}"
-            )
-
-            response = litellm.completion(
-                model=model_to_use,
-                messages=messages,
-                num_retries=6,
-                **final_call_params,
-            )
-            return response.choices[0].message.content
-        except litellm.exceptions.RateLimitError as e:
-            logger.error(f"LiteLLM Rate Limit Error: {e}")
-            raise
-        except litellm.exceptions.APIConnectionError as e:
-            logger.error(f"LiteLLM API Connection Error: {e}")
-            raise
-        except litellm.exceptions.ContextWindowExceededError as e:
-            logger.error(f"LiteLLM Context Window Exceeded Error: {e}")
-            # Log prompt length if possible? Needs access to prompt_for_llm here.
-            raise
-        except Exception:
-            # logger.error(
-            #     f"Error calling model '{model_to_use}': {type(e).__name__} - {e}"
-            # )
-            raise
-
     def _evaluate_prompt(
         self,
         prompt: chat_prompt.ChatPrompt,

@@ -433,6 +323,12 @@ class MetaPromptOptimizer(BaseOptimizer):

             cleaned_model_output = raw_model_output.strip()

+            # Add tags to trace for optimization tracking
+            if self.current_optimization_id:
+                opik_context.update_current_trace(
+                    tags=[self.current_optimization_id, "Evaluation"]
+                )
+
             result = {
                 mappers.EVALUATED_LLM_TASK_OUTPUT: cleaned_model_output,
             }

@@ -447,8 +343,8 @@ class MetaPromptOptimizer(BaseOptimizer):
             metric=metric,
             evaluated_task=llm_task,
             dataset_item_ids=dataset_item_ids,
-            num_threads=self.
-            project_name=self.
+            num_threads=self.n_threads,
+            project_name=self.project_name,
             n_samples=subset_size,  # Use subset_size for trials, None for full dataset
             experiment_config=experiment_config,
             optimization_id=optimization_id,

@@ -466,35 +362,83 @@ class MetaPromptOptimizer(BaseOptimizer):
         n_samples: int | None = None,
         auto_continue: bool = False,
         agent_class: type[OptimizableAgent] | None = None,
+        project_name: str = "Optimization",
+        max_trials: int = 10,
+        mcp_config: MCPExecutionConfig | None = None,
+        candidate_generator: Callable[..., list[chat_prompt.ChatPrompt]] | None = None,
+        candidate_generator_kwargs: dict[str, Any] | None = None,
+        *args: Any,
         **kwargs: Any,
     ) -> OptimizationResult:
-        mcp_config = kwargs.pop("mcp_config", None)
-        candidate_generator = kwargs.pop("candidate_generator", None)
-        candidate_generator_kwargs = kwargs.pop("candidate_generator_kwargs", None)
-
         """
-        Optimize a prompt using meta-reasoning.
+        Optimize a prompt using LLM-based meta-reasoning to iteratively improve performance.
+
+        The optimizer evaluates the initial prompt, uses an LLM to reason about improvements,
+        generates candidate variations, and iteratively selects the best performers until
+        max_trials is reached.

         Args:
-            prompt: The
-
-
-
-
-
-
-
-
-
-
+            prompt: The ChatPrompt to optimize. Can include system/user/assistant messages,
+                tools, and model configuration.
+            dataset: Opik Dataset containing evaluation examples. Each item is passed to the
+                prompt during evaluation.
+            metric: Evaluation function that takes (dataset_item, llm_output) and returns a
+                score (float). Higher scores indicate better performance.
+            experiment_config: Optional metadata dictionary to log with Opik experiments.
+                Useful for tracking experiment parameters and context.
+            n_samples: Number of dataset items to use per evaluation. If None, uses full dataset.
+                Lower values speed up optimization but may be less reliable.
+            auto_continue: If True, optimizer may continue beyond max_trials if improvements
+                are still being found.
+            agent_class: Custom agent class for prompt execution. If None, uses default
+                LiteLLM-based agent. Must inherit from OptimizableAgent.
+            project_name: Opik project name for logging traces and experiments. Default: "Optimization"
+            max_trials: Maximum total number of prompts to evaluate across all rounds.
+                Optimizer stops when this limit is reached.
+            mcp_config: Optional MCP (Model Context Protocol) execution configuration for
+                prompts that use external tools. Enables tool-calling workflows. Default: None
+            candidate_generator: Optional custom function to generate candidate prompts.
+                Overrides default meta-reasoning generator. Should return list[ChatPrompt].
+            candidate_generator_kwargs: Optional kwargs to pass to candidate_generator.

         Returns:
-            OptimizationResult:
+            OptimizationResult: Contains the best prompt found, final score, optimization
+                history, and metadata about the optimization run.
+
+        Example:
+            ```python
+            from opik_optimizer import MetaPromptOptimizer, ChatPrompt
+            from opik import Opik
+
+            client = Opik()
+            dataset = client.get_dataset("my_dataset")
+
+            prompt = ChatPrompt(
+                system="You are a helpful assistant.",
+                user_template="Answer this question: {question}"
+            )
+
+            def accuracy_metric(dataset_item, llm_output):
+                return 1.0 if llm_output == dataset_item["expected"] else 0.0
+
+            optimizer = MetaPromptOptimizer(model="gpt-4o")
+            result = optimizer.optimize_prompt(
+                prompt=prompt,
+                dataset=dataset,
+                metric=accuracy_metric,
+                max_trials=10
+            )
+
+            print(f"Best score: {result.best_score}")
+            print(f"Best prompt: {result.best_prompt}")
+            ```
         """
         # Use base class validation and setup methods
-        self.
-        self.
-
+        self._validate_optimization_inputs(prompt, dataset, metric)
+        self.agent_class = self._setup_agent_class(prompt, agent_class)
+
+        # Set project name from parameter
+        self.project_name = project_name

         total_items = len(dataset.get_items())
         if n_samples is not None and n_samples > total_items:

@@ -510,12 +454,14 @@ class MetaPromptOptimizer(BaseOptimizer):
                 objective_name=getattr(metric, "__name__", str(metric)),
                 metadata={"optimizer": self.__class__.__name__},
             )
+            self.current_optimization_id = optimization.id
             logger.debug(f"Created optimization with ID: {optimization.id}")
         except Exception as e:
             logger.warning(
                 f"Opik server does not support optimizations: {e}. Please upgrade opik."
             )
             optimization = None
+            self.current_optimization_id = None

         reporting.display_header(
             algorithm=self.__class__.__name__,

@@ -527,6 +473,8 @@ class MetaPromptOptimizer(BaseOptimizer):
             messages=prompt.get_messages(),
             optimizer_config={
                 "optimizer": self.__class__.__name__,
+                "max_trials": max_trials,
+                "prompts_per_round": self.prompts_per_round,
                 "n_samples": n_samples,
                 "auto_continue": auto_continue,
             },

@@ -542,21 +490,21 @@ class MetaPromptOptimizer(BaseOptimizer):
                 dataset=dataset,
                 metric=metric,
                 experiment_config=experiment_config,
+                max_trials=max_trials,
                 n_samples=n_samples,
                 auto_continue=auto_continue,
                 mcp_config=mcp_config,
                 candidate_generator=candidate_generator,
                 candidate_generator_kwargs=candidate_generator_kwargs,
-                **kwargs,
             )
             if optimization:
-                self.
+                self._update_optimization(optimization, status="completed")
                 logger.debug("Optimization completed successfully")
             return result
         except Exception as e:
             logger.error(f"Optimization failed: {e}")
             if optimization:
-                self.
+                self._update_optimization(optimization, status="cancelled")
                 logger.debug("Optimization marked as cancelled")
             raise e

@@ -601,14 +549,15 @@ class MetaPromptOptimizer(BaseOptimizer):
         if tool_segment_id not in {segment.segment_id for segment in segments}:
             raise ValueError(f"Tool '{tool_name}' not present in prompt tools")

-        return self.
+        return self._optimize_prompt(
+            optimization_id=None,
             prompt=prompt,
             dataset=dataset,
             metric=metric,
             experiment_config=experiment_config,
+            max_trials=10,
             n_samples=n_samples,
             auto_continue=auto_continue,
-            agent_class=agent_class,
             mcp_config=mcp_config,
             candidate_generator=self._generate_mcp_candidate_prompts,
             candidate_generator_kwargs={

@@ -617,7 +566,6 @@ class MetaPromptOptimizer(BaseOptimizer):
                 "panel_style": panel_style,
             },
             tool_panel_style=panel_style,
-            **kwargs,
         )

     def _optimize_prompt(

@@ -627,26 +575,25 @@ class MetaPromptOptimizer(BaseOptimizer):
         dataset: Dataset,
         metric: Callable,
         experiment_config: dict | None,
+        max_trials: int,
         n_samples: int | None,
         auto_continue: bool,
         mcp_config: MCPExecutionConfig | None = None,
-        candidate_generator: None
-        | (Callable[..., list[chat_prompt.ChatPrompt]]) = None,
+        candidate_generator: Callable[..., list[chat_prompt.ChatPrompt]] | None = None,
         candidate_generator_kwargs: dict[str, Any] | None = None,
         tool_panel_style: str = "bright_magenta",
-        **kwargs: Any,
     ) -> OptimizationResult:
         self.auto_continue = auto_continue
         self.dataset = dataset
         self.prompt = prompt
-        self.
+        self._reset_counters()  # Reset counters for run
         initial_prompt = prompt

         current_prompt = prompt
         configuration_updates = self._drop_none(
             {
-                "
-                "
+                "max_trials": max_trials,
+                "prompts_per_round": self.prompts_per_round,
             }
         )
         meta_metadata = {"stage": "initial"}

@@ -678,20 +625,33 @@ class MetaPromptOptimizer(BaseOptimizer):
             baseline_reporter.set_score(initial_score)

         reporting.display_optimization_start_message(verbose=self.verbose)
+
+        # Calculate the maximum number of rounds, we will stop early if we hit the
+        # max_trials limit
+        estimated_rounds = max(1, max_trials // self.prompts_per_round + 1)
+
         with reporting.display_round_progress(
-
+            estimated_rounds, verbose=self.verbose
         ) as round_reporter:
-
+            round_num = 0
+            trials_used = 0
+
+            while trials_used < max_trials:
                 round_reporter.round_start(round_num)
                 previous_best_score = best_score

+                # Calculate how many prompts to generate this round
+                prompts_this_round = min(
+                    self.prompts_per_round, max_trials - trials_used
+                )
+
                 # Step 1. Create a set of candidate prompts
                 generator = candidate_generator or self._generate_candidate_prompts
                 generator_kwargs = dict(candidate_generator_kwargs or {})

                 try:
                     candidate_prompts = generator(
-                        project_name=self.
+                        project_name=self.project_name,
                         current_prompt=best_prompt,
                         best_score=best_score,
                         round_num=round_num,

@@ -700,8 +660,11 @@ class MetaPromptOptimizer(BaseOptimizer):
                         optimization_id=optimization_id,
                         **generator_kwargs,
                     )
+                    # Limit to prompts_this_round
+                    candidate_prompts = candidate_prompts[:prompts_this_round]
                 except Exception as e:
-                    round_reporter.failed_to_generate(
+                    round_reporter.failed_to_generate(prompts_this_round, e)
+                    round_num += 1
                     continue

                 # Step 2. Score each candidate prompt

@@ -728,6 +691,7 @@ class MetaPromptOptimizer(BaseOptimizer):
                         )

                         eval_report.set_final_score(best_score, prompt_score)
+                        trials_used += 1
                     except Exception:
                         logger.warning("Failed evaluating agent; continuing...")
                         prompt_score = 0

@@ -764,6 +728,9 @@ class MetaPromptOptimizer(BaseOptimizer):
                     best_score = best_cand_score_avg
                     best_prompt = best_candidate_this_round

+                # Increment counters
+                round_num += 1
+
                 if tool_panel_style and getattr(best_prompt, "tools", None):
                     description = (
                         best_prompt.tools[0].get("function", {}).get("description", "")

@@ -868,20 +835,13 @@ class MetaPromptOptimizer(BaseOptimizer):
             "total_rounds": len(rounds),
             "metric_name": getattr(metric, "__name__", str(metric)),
             "model": self.model,
-            "temperature": self.
+            "temperature": self.model_parameters.get("temperature"),
         }

         if best_tools:
             details["final_tools"] = best_tools

-        tool_prompts =
-        if best_tools:
-            tool_prompts = {
-                (tool.get("function", {}).get("name") or f"tool_{idx}"): tool.get(
-                    "function", {}
-                ).get("description")
-                for idx, tool in enumerate(best_tools)
-            }
+        tool_prompts = self._extract_tool_prompts(best_tools)

         return OptimizationResult(
             optimizer=self.__class__.__name__,

@@ -936,7 +896,7 @@ class MetaPromptOptimizer(BaseOptimizer):
     ) -> list[chat_prompt.ChatPrompt]:
         """Generate candidate prompts using meta-prompting."""
         with reporting.display_candidate_generation_report(
-            self.
+            self.prompts_per_round, verbose=self.verbose
         ) as candidate_generation_report:
             logger.debug(f"\nGenerating candidate prompts for round {round_num + 1}")
             logger.debug(f"Generating from prompt: {current_prompt.get_messages()}")

@@ -972,7 +932,7 @@ class MetaPromptOptimizer(BaseOptimizer):
         {task_context_str}

         {analysis_instruction}
-        Generate {self.
+        Generate {self.prompts_per_round} improved versions of this prompt.
         {metric_focus_instruction}
         Each version should aim to:
         {improvement_point_1}

@@ -984,15 +944,24 @@ class MetaPromptOptimizer(BaseOptimizer):
         Return a valid JSON array as specified."""

         try:
-            #
+            # Prepare metadata for optimization algorithm call
+            metadata_for_call: dict[str, Any] = {}
+            if project_name:
+                metadata_for_call["project_name"] = project_name
+                metadata_for_call["opik"] = {"project_name": project_name}
+            if optimization_id and "opik" in metadata_for_call:
+                metadata_for_call["opik"]["optimization_id"] = optimization_id
+            metadata_for_call["optimizer_name"] = self.__class__.__name__
+            metadata_for_call["opik_call_type"] = "optimization_algorithm"
+
+            # Use _call_model for optimization algorithm
             content = self._call_model(
-                project_name,
                 messages=[
                     {"role": "system", "content": self._REASONING_SYSTEM_PROMPT},
                     {"role": "user", "content": user_prompt},
                 ],
-                is_reasoning=True,
                 optimization_id=optimization_id,
+                metadata=metadata_for_call,
             )
             logger.debug(f"Raw response from reasoning model: {content}")

@@ -1056,6 +1025,8 @@ class MetaPromptOptimizer(BaseOptimizer):
                     chat_prompt.ChatPrompt(
                         system=item["prompt"][0]["content"],
                         user=user_text,
+                        tools=current_prompt.tools,
+                        function_map=current_prompt.function_map,
                     )
                 )

@@ -1125,7 +1096,7 @@ class MetaPromptOptimizer(BaseOptimizer):
         Current best score: {best_score:.4f}
         {history_context}

-        Generate {self.
+        Generate {self.prompts_per_round} improved descriptions for this tool.
         Each description should clarify expected input arguments and set explicit expectations
         for how the tool output must be used in the final response.
         Avoid changing unrelated parts of the prompt. Focus only on the description text for `{tool_name}`.

@@ -1144,17 +1115,26 @@ class MetaPromptOptimizer(BaseOptimizer):
         ).strip()

         with reporting.display_candidate_generation_report(
-            self.
+            self.prompts_per_round, verbose=self.verbose
         ) as candidate_generation_report:
             try:
+                # Prepare metadata for optimization algorithm call
+                metadata_for_call_tools: dict[str, Any] = {}
+                if project_name:
+                    metadata_for_call_tools["project_name"] = project_name
+                    metadata_for_call_tools["opik"] = {"project_name": project_name}
+                if optimization_id and "opik" in metadata_for_call_tools:
+                    metadata_for_call_tools["opik"]["optimization_id"] = optimization_id
+                metadata_for_call_tools["optimizer_name"] = self.__class__.__name__
+                metadata_for_call_tools["opik_call_type"] = "optimization_algorithm"
+
                 content = self._call_model(
-                    project_name,
                     messages=[
                         {"role": "system", "content": self._REASONING_SYSTEM_PROMPT},
                         {"role": "user", "content": instruction},
                     ],
-                    is_reasoning=True,
                     optimization_id=optimization_id,
+                    metadata=metadata_for_call_tools,
                 )

                 try:
--- a/opik_optimizer/meta_prompt_optimizer/reporting.py
+++ b/opik_optimizer/meta_prompt_optimizer/reporting.py
@@ -6,12 +6,12 @@ from rich.panel import Panel
 from rich.text import Text

 from ..optimization_config import chat_prompt
-from ..reporting_utils import (
+from ..reporting_utils import (  # noqa: F401
     convert_tqdm_to_rich,
-    display_configuration,
-    display_header,
+    display_configuration,
+    display_header,
     display_messages,
-    display_result,
+    display_result,
     get_console,
     suppress_opik_logs,
 )
--- a/opik_optimizer/mipro_optimizer/__init__.py
+++ b/opik_optimizer/mipro_optimizer/__init__.py
@@ -1,3 +1,3 @@
-from .
+from .mipro_optimizer_v2 import MIPROv2

-__all__ = ["
+__all__ = ["MIPROv2"]
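Taken together with the file listing above (mipro_optimizer.py removed, _mipro_optimizer_v2.py renamed to mipro_optimizer_v2.py), the __init__.py hunk suggests that MIPROv2 is the only name the mipro_optimizer package exports in 2.2.0. A minimal sketch of the import path, assuming no other re-exports:

```python
# 2.2.0: the package re-exports MIPROv2 from the renamed mipro_optimizer_v2 module
from opik_optimizer.mipro_optimizer import MIPROv2
```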