opik-optimizer 1.1.0__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opik_optimizer/__init__.py +2 -0
- opik_optimizer/base_optimizer.py +376 -19
- opik_optimizer/evolutionary_optimizer/evaluation_ops.py +80 -17
- opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +179 -39
- opik_optimizer/evolutionary_optimizer/llm_support.py +3 -1
- opik_optimizer/evolutionary_optimizer/mcp.py +249 -0
- opik_optimizer/evolutionary_optimizer/mutation_ops.py +17 -3
- opik_optimizer/evolutionary_optimizer/population_ops.py +5 -0
- opik_optimizer/evolutionary_optimizer/prompts.py +47 -0
- opik_optimizer/evolutionary_optimizer/reporting.py +12 -0
- opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +65 -59
- opik_optimizer/gepa_optimizer/adapter.py +5 -3
- opik_optimizer/gepa_optimizer/gepa_optimizer.py +163 -66
- opik_optimizer/mcp_utils/mcp_workflow.py +57 -3
- opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +75 -69
- opik_optimizer/mipro_optimizer/_lm.py +10 -3
- opik_optimizer/mipro_optimizer/_mipro_optimizer_v2.py +1 -1
- opik_optimizer/mipro_optimizer/mipro_optimizer.py +96 -21
- opik_optimizer/optimizable_agent.py +5 -0
- opik_optimizer/optimization_result.py +1 -0
- opik_optimizer/utils/core.py +56 -14
- {opik_optimizer-1.1.0.dist-info → opik_optimizer-2.0.0.dist-info}/METADATA +96 -9
- {opik_optimizer-1.1.0.dist-info → opik_optimizer-2.0.0.dist-info}/RECORD +27 -26
- /opik_optimizer/{colbert.py → utils/colbert.py} +0 -0
- {opik_optimizer-1.1.0.dist-info → opik_optimizer-2.0.0.dist-info}/WHEEL +0 -0
- {opik_optimizer-1.1.0.dist-info → opik_optimizer-2.0.0.dist-info}/licenses/LICENSE +0 -0
- {opik_optimizer-1.1.0.dist-info → opik_optimizer-2.0.0.dist-info}/top_level.txt +0 -0
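Taken together, the hunks below change the public surface of the few-shot optimizer: `project_name` moves from the optimizer constructor to the `ChatPrompt`, and `n_trials` moves from a named parameter of `optimize_prompt` into `**kwargs`. A hedged migration sketch, assuming the package still exposes `ChatPrompt` and `FewShotBayesianOptimizer` at the top level and that `ChatPrompt` accepts `project_name` as the new deprecation message indicates:

```python
# Hypothetical 1.x -> 2.0 migration sketch; `dataset` is a stand-in for an
# existing opik.Dataset and the metric is a toy scorer.
from opik_optimizer import ChatPrompt, FewShotBayesianOptimizer


def exact_match(dataset_item, llm_output):
    # Toy metric; replace with a real scorer.
    return float(dataset_item["expected_output"] == llm_output)


prompt = ChatPrompt(
    project_name="my-project",  # 2.0: moved here from the optimizer constructor
    messages=[{"role": "system", "content": "You are a helpful assistant."}],
)
optimizer = FewShotBayesianOptimizer(model="openai/gpt-4o-mini")
result = optimizer.optimize_prompt(
    prompt=prompt,
    dataset=dataset,
    metric=exact_match,
    n_samples=50,
    n_trials=10,  # 2.0: forwarded via **kwargs instead of a named parameter
)
```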
opik_optimizer/evolutionary_optimizer/population_ops.py

@@ -8,6 +8,7 @@ from deap import creator as _creator
 
 from . import prompts as evo_prompts
 from . import reporting
+from .mcp import EvolutionaryMCPContext, initialize_population_mcp
 from ..optimization_config import chat_prompt
 from .. import utils
 
@@ -22,6 +23,7 @@ class PopulationOps:
     output_style_guidance: str
     _call_model: Any
     toolbox: Any
+    _mcp_context: EvolutionaryMCPContext | None
     # Hints for mixin attributes provided by the primary optimizer class
     _gens_since_pop_improvement: int
     _best_primary_score_history: list[float]
@@ -39,6 +41,9 @@ class PopulationOps:
         including some 'fresh start' prompts based purely on task description.
         All generated prompts should aim to elicit answers matching self.output_style_guidance.
         """
+        mcp_context = getattr(self, "_mcp_context", None)
+        if mcp_context is not None:
+            return initialize_population_mcp(self, prompt, mcp_context)
         with reporting.initializing_population(verbose=self.verbose) as init_pop_report:
             init_pop_report.start(self.population_size)
 
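The hunk above short-circuits population initialization to an MCP-specific path whenever the host optimizer has set `_mcp_context`. A minimal, self-contained sketch of that `getattr`-based mixin delegation pattern; the names `ToolContext`, `init_with_tools`, `PopulationMixin`, and `Host` are illustrative, not the package's real API:

```python
from typing import Any


class ToolContext:
    """Stand-in for EvolutionaryMCPContext."""


def init_with_tools(host: Any, prompt: str, context: ToolContext) -> list[str]:
    # Stand-in for initialize_population_mcp.
    return [f"[tool-aware] {prompt}"]


class PopulationMixin:
    def initialize_population(self, prompt: str) -> list[str]:
        context = getattr(self, "_mcp_context", None)  # absent unless host sets it
        if context is not None:
            return init_with_tools(self, prompt, context)
        return [prompt]  # default, non-MCP path


class Host(PopulationMixin):
    def __init__(self, context: ToolContext | None = None):
        self._mcp_context = context


print(Host(ToolContext()).initialize_population("describe the tool"))
# ['[tool-aware] describe the tool']
```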
opik_optimizer/evolutionary_optimizer/prompts.py

@@ -206,6 +206,53 @@ def mutation_strategy_prompts(output_style_guidance: str | None) -> dict[str, str]
     }
 
 
+# ---------------------------------------------------------------------------
+# MCP prompts
+# ---------------------------------------------------------------------------
+
+
+def mcp_tool_rewrite_system_prompt() -> str:
+    return (
+        "You are an expert prompt engineer tasked with refining MCP tool descriptions. "
+        "Always respond with strictly valid JSON matching the requested schema."
+    )
+
+
+def mcp_tool_rewrite_user_prompt(
+    *,
+    tool_name: str,
+    current_description: str,
+    tool_metadata_json: str,
+    num_variations: int,
+) -> str:
+    current_description = current_description.strip() or "(no description provided)"
+    return f"""You are improving the description of the MCP tool `{tool_name}`.
+
+Current description:
+---
+{current_description}
+---
+
+Tool metadata (JSON):
+{tool_metadata_json}
+
+Generate {num_variations} improved descriptions for this tool. Each description should:
+- Clarify expected arguments and their semantics.
+- Explain how the tool output should be used in the final response.
+- Avoid changing the tool name or introducing unsupported behaviour.
+
+Respond strictly as JSON of the form:
+{{
+  "prompts": [
+    {{
+      "tool_description": "...",
+      "improvement_focus": "..."
+    }}
+  ]
+}}
+"""
+
+
 def semantic_mutation_user_prompt(
     prompt_messages: list[dict[str, str]],
     task_description: str,
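A hedged usage sketch for the two prompt builders added above: they are composed into a system/user message pair, and the reply is parsed under the strict-JSON contract the system prompt demands. Only the builder functions come from the diff; the model call is mocked:

```python
import json

from opik_optimizer.evolutionary_optimizer.prompts import (
    mcp_tool_rewrite_system_prompt,
    mcp_tool_rewrite_user_prompt,
)

messages = [
    {"role": "system", "content": mcp_tool_rewrite_system_prompt()},
    {
        "role": "user",
        "content": mcp_tool_rewrite_user_prompt(
            tool_name="search_docs",
            current_description="Searches documentation.",
            tool_metadata_json=json.dumps({"args": {"query": "string"}}),
            num_variations=2,
        ),
    },
]

# response_text would come from the LLM; the prompt demands this exact shape.
response_text = '{"prompts": [{"tool_description": "...", "improvement_focus": "..."}]}'
candidates = json.loads(response_text)["prompts"]
for candidate in candidates:
    print(candidate["tool_description"], "--", candidate["improvement_focus"])
```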
opik_optimizer/evolutionary_optimizer/reporting.py

@@ -20,6 +20,18 @@ PANEL_WIDTH = 70
 console = get_console()
 
 
+def display_tool_description(description: str, title: str, style: str) -> None:
+    panel = Panel(
+        Text(description),
+        title=title,
+        title_align="left",
+        border_style=style,
+        width=PANEL_WIDTH,
+        padding=(1, 2),
+    )
+    console.print(panel)
+
+
 @contextmanager
 def infer_output_style(verbose: int = 1) -> Any:
     class Reporter:
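The new helper is a thin wrapper over Rich's public `Panel`/`Text` API. A standalone sketch that reproduces the same rendering with illustrative strings (the example title and border style are ours, not the package's):

```python
from rich.console import Console
from rich.panel import Panel
from rich.text import Text

PANEL_WIDTH = 70  # as in the module above
console = Console()

panel = Panel(
    Text("Searches the documentation index and returns the top matches."),
    title="Original tool description",
    title_align="left",
    border_style="yellow",
    width=PANEL_WIDTH,
    padding=(1, 2),
)
console.print(panel)
```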
opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py

@@ -1,6 +1,8 @@
 from typing import Any
 from collections.abc import Callable
+import warnings
 
+import copy
 import json
 import logging
 import random
@@ -16,7 +18,6 @@ from opik.evaluation.models.litellm import opik_monitor as opik_litellm_monitor
 from pydantic import BaseModel
 
 from opik_optimizer import base_optimizer
-from ..utils import create_litellm_agent_class
 from ..optimization_config import chat_prompt, mappers
 from ..optimizable_agent import OptimizableAgent
 from .. import _throttle, optimization_result, task_evaluator, utils
@@ -95,8 +96,11 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
             **model_kwargs: Additional model parameters
         """
         if "project_name" in model_kwargs:
-
-
+            warnings.warn(
+                "The 'project_name' parameter in optimizer constructor is deprecated. "
+                "Set project_name in the ChatPrompt instead.",
+                DeprecationWarning,
+                stacklevel=2,
             )
             del model_kwargs["project_name"]
 
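The `stacklevel=2` argument matters here: it attributes the warning to the caller's line (the constructor call site) rather than to the optimizer internals. A self-contained demonstration of the same pattern:

```python
import warnings


def constructor(**model_kwargs):
    if "project_name" in model_kwargs:
        warnings.warn(
            "'project_name' is deprecated here; set it on the ChatPrompt.",
            DeprecationWarning,
            stacklevel=2,  # report the caller's frame, i.e. the line below
        )
        del model_kwargs["project_name"]
    return model_kwargs


with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    constructor(project_name="demo", temperature=0.2)

print(caught[0].category.__name__, "->", caught[0].message)
# DeprecationWarning -> 'project_name' is deprecated here; set it on the ChatPrompt.
```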
@@ -112,10 +116,14 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
         elif self.verbose == 2:
             logger.setLevel(logging.DEBUG)
 
-        self._opik_client = opik.Opik()
-        self.llm_call_counter = 0
         logger.debug(f"Initialized FewShotBayesianOptimizer with model: {model}")
 
+    def get_optimizer_metadata(self) -> dict[str, Any]:
+        return {
+            "min_examples": self.min_examples,
+            "max_examples": self.max_examples,
+        }
+
     @_throttle.rate_limited(_limiter)
     def _call_model(
         self,
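Note that the optimizer no longer owns its Opik client or its call counter: the attribute initialization is deleted above, `_call_model` now calls `self.increment_llm_counter()`, and the result later reports `self.tool_call_counter`. All of this presumably lives in the reworked `BaseOptimizer` (+376 lines in this release), which is not shown here. A speculative sketch of such centralized counters; only `increment_llm_counter` appears in the diff, the other helpers are hypothetical:

```python
# Speculative sketch only: the real BaseOptimizer is not shown in this diff.
class CounterMixin:
    def __init__(self) -> None:
        self.llm_call_counter = 0
        self.tool_call_counter = 0

    def increment_llm_counter(self) -> None:
        self.llm_call_counter += 1

    def increment_tool_counter(self) -> None:  # hypothetical name
        self.tool_call_counter += 1

    def reset_counters(self) -> None:  # hypothetical name
        self.llm_call_counter = 0
        self.tool_call_counter = 0
```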
@@ -134,7 +142,7 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
         Returns:
             Dict containing the model's response
         """
-        self.llm_call_counter += 1
+        self.increment_llm_counter()
 
         current_model_kwargs = self.model_kwargs.copy()
         current_model_kwargs.update(model_kwargs)
@@ -260,19 +268,20 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
         if n_samples is not None and n_samples < len(dataset_items):
             eval_dataset_item_ids = random.sample(all_dataset_item_ids, n_samples)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
+        configuration_updates = self._drop_none(
+            {
+                "n_trials": n_trials,
+                "n_samples": n_samples,
+                "baseline_score": baseline_score,
+            }
+        )
+        base_experiment_config = self._prepare_experiment_config(
+            prompt=prompt,
+            dataset=dataset,
+            metric=metric,
+            experiment_config=experiment_config,
+            configuration_updates=configuration_updates,
+        )
 
         # Start Optuna Study
         def optimization_objective(trial: optuna.Trial) -> float:
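`_drop_none` is not defined in this diff; from its call sites it presumably filters out unset options so that only values the caller actually supplied land in the logged experiment configuration. A minimal sketch under that assumption:

```python
from typing import Any


def _drop_none(values: dict[str, Any]) -> dict[str, Any]:
    # Keep only the options the caller actually supplied.
    return {key: value for key, value in values.items() if value is not None}


print(_drop_none({"n_trials": 10, "n_samples": None, "baseline_score": 0.42}))
# -> {'n_trials': 10, 'baseline_score': 0.42}
```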
@@ -327,7 +336,7 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
             ]
 
             # Log trial config
-            trial_config =
+            trial_config = copy.deepcopy(base_experiment_config)
             trial_config["configuration"]["prompt"] = (
                 messages_for_reporting  # Base instruction
             )
@@ -481,6 +490,7 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
             },
             history=optuna_history_processed,
             llm_calls=self.llm_call_counter,
+            tool_calls=self.tool_call_counter,
             dataset_id=dataset.id,
             optimization_id=optimization_id,
         )
@@ -490,47 +500,39 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
         prompt: chat_prompt.ChatPrompt,
         dataset: Dataset,
         metric: Callable,
-        n_trials: int = 10,
-        agent_class: type[OptimizableAgent] | None = None,
         experiment_config: dict | None = None,
         n_samples: int | None = None,
+        auto_continue: bool = False,
+        agent_class: type[OptimizableAgent] | None = None,
+        **kwargs: Any,
     ) -> optimization_result.OptimizationResult:
         """
         Args:
-            prompt:
+            prompt: The prompt to optimize
             dataset: Opik Dataset to optimize on
             metric: Metric function to evaluate on
-            n_trials: Number of trials for Bayesian Optimization
             experiment_config: Optional configuration for the experiment, useful to log additional metadata
             n_samples: Optional number of items to test in the dataset
+            auto_continue: Whether to auto-continue optimization
+            agent_class: Optional agent class to use
+            **kwargs: Additional parameters including:
+                n_trials (int): Number of trials for Bayesian Optimization (default: 10)
+                mcp_config (MCPExecutionConfig | None): MCP tool calling configuration (default: None)
 
         Returns:
             OptimizationResult: Result of the optimization
         """
-
-
+        # Use base class validation and setup methods
+        self.validate_optimization_inputs(prompt, dataset, metric)
+        self.configure_prompt_model(prompt)
+        self.agent_class = self.setup_agent_class(prompt, agent_class)
 
-
-
-
-        if not callable(metric):
-            raise ValueError(
-                "Metric must be a function that takes `dataset_item` and `llm_output` as arguments."
-            )
-
-        if prompt.model is None:
-            prompt.model = self.model
-        if prompt.model_kwargs is None:
-            prompt.model_kwargs = self.model_kwargs
-
-        if agent_class is None:
-            self.agent_class = create_litellm_agent_class(prompt)
-        else:
-            self.agent_class = agent_class
+        # Extract n_trials from kwargs for backward compatibility
+        n_trials = kwargs.get("n_trials", 10)
 
         optimization = None
         try:
-            optimization = self._opik_client.create_optimization(
+            optimization = self.opik_client.create_optimization(
                 dataset_name=dataset.name,
                 objective_name=metric.__name__,
                 metadata={"optimizer": self.__class__.__name__},
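Moving `n_trials` into `**kwargs` keeps old call sites working, at the cost that a misspelled option is silently ignored instead of raising a `TypeError`. The pattern in isolation:

```python
# Backward-compatible keyword handling, as in optimize_prompt above.
from typing import Any


def optimize_prompt_signature_demo(*, n_samples: int | None = None, **kwargs: Any) -> int:
    n_trials = kwargs.get("n_trials", 10)  # old keyword still honored
    return n_trials


assert optimize_prompt_signature_demo() == 10
assert optimize_prompt_signature_demo(n_trials=25) == 25
assert optimize_prompt_signature_demo(n_trails=25) == 10  # typo silently ignored
```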
@@ -636,20 +638,6 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
         """
         llm_task = self._build_task_from_messages(prompt, prompt.get_messages())
 
-        experiment_config = experiment_config or {}
-        experiment_config["project_name"] = self.agent_class.__name__
-        experiment_config = {
-            **experiment_config,
-            **{
-                "optimizer": self.__class__.__name__,
-                "agent_class": self.agent_class.__name__,
-                "agent_config": prompt.to_dict(),
-                "metric": metric.__name__,
-                "dataset": dataset.name,
-                "configuration": {"prompt": prompt.get_messages()},
-            },
-        }
-
         if n_samples is not None:
             if dataset_item_ids is not None:
                 raise Exception("Can't use n_samples and dataset_item_ids")
@@ -657,6 +645,24 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
             all_ids = [dataset_item["id"] for dataset_item in dataset.get_items()]
             dataset_item_ids = random.sample(all_ids, n_samples)
 
+        configuration_updates = self._drop_none(
+            {
+                "n_samples": n_samples,
+                "dataset_item_ids": dataset_item_ids,
+            }
+        )
+        additional_metadata = (
+            {"optimization_id": optimization_id} if optimization_id else None
+        )
+        experiment_config = self._prepare_experiment_config(
+            prompt=prompt,
+            dataset=dataset,
+            metric=metric,
+            experiment_config=experiment_config,
+            configuration_updates=configuration_updates,
+            additional_metadata=additional_metadata,
+        )
+
         logger.debug("Starting FewShotBayesian evaluation...")
         score = task_evaluator.evaluate(
             dataset=dataset,
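`_prepare_experiment_config` replaces the inline metadata dict deleted in the previous hunk. A hedged sketch of what it plausibly assembles, inferred from that removed dict plus the keyword arguments at its two call sites; the real helper lives in the expanded `BaseOptimizer` and is not shown in this diff:

```python
from collections.abc import Callable
from typing import Any


def prepare_experiment_config(
    optimizer: Any,
    *,
    prompt: Any,
    dataset: Any,
    metric: Callable,
    experiment_config: dict | None = None,
    configuration_updates: dict | None = None,
    additional_metadata: dict | None = None,
) -> dict:
    config = dict(experiment_config or {})
    config.update(
        {
            "optimizer": optimizer.__class__.__name__,
            "agent_class": optimizer.agent_class.__name__,
            "agent_config": prompt.to_dict(),
            "metric": metric.__name__,
            "dataset": dataset.name,
            "configuration": {
                "prompt": prompt.get_messages(),
                **(configuration_updates or {}),  # e.g. n_samples, dataset_item_ids
            },
        }
    )
    if additional_metadata:
        config.update(additional_metadata)  # e.g. {"optimization_id": ...}
    return config
```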
|
@@ -664,7 +670,7 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
|
|
664
670
|
metric=metric,
|
665
671
|
evaluated_task=llm_task,
|
666
672
|
num_threads=self.n_threads,
|
667
|
-
project_name=
|
673
|
+
project_name=experiment_config.get("project_name"),
|
668
674
|
experiment_config=experiment_config,
|
669
675
|
optimization_id=optimization_id,
|
670
676
|
verbose=self.verbose,
|
opik_optimizer/gepa_optimizer/adapter.py

@@ -12,7 +12,7 @@ from ..optimization_config import chat_prompt
 from ..utils import create_litellm_agent_class
 
 
-
+logger = logging.getLogger(__name__)
 
 
 @dataclass
@@ -77,7 +77,9 @@ class OpikGEPAAdapter(GEPAAdapter[OpikDataInst, dict[str, Any], dict[str, Any]])
         system_text = _extract_system_text(candidate, self._system_fallback)
         prompt_variant = _apply_system_text(self._base_prompt, system_text)
 
-        agent_class = create_litellm_agent_class(prompt_variant)
+        agent_class = create_litellm_agent_class(
+            prompt_variant, optimizer_ref=self._optimizer
+        )
         agent = agent_class(prompt_variant)
 
         outputs: list[dict[str, Any]] = []
@@ -144,7 +146,7 @@ class OpikGEPAAdapter(GEPAAdapter[OpikDataInst, dict[str, Any], dict[str, Any]])
 
         reflective_records = list(_records())
         if not reflective_records:
-
+            logger.debug(
                 "No trajectories captured for candidate; returning empty reflective dataset"
             )
             reflective_records = []