opik-optimizer 1.0.6__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opik_optimizer/__init__.py +4 -0
- opik_optimizer/_throttle.py +2 -1
- opik_optimizer/base_optimizer.py +402 -28
- opik_optimizer/data/context7_eval.jsonl +3 -0
- opik_optimizer/datasets/context7_eval.py +90 -0
- opik_optimizer/datasets/tiny_test.py +33 -34
- opik_optimizer/datasets/truthful_qa.py +2 -2
- opik_optimizer/evolutionary_optimizer/crossover_ops.py +194 -0
- opik_optimizer/evolutionary_optimizer/evaluation_ops.py +136 -0
- opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +289 -966
- opik_optimizer/evolutionary_optimizer/helpers.py +10 -0
- opik_optimizer/evolutionary_optimizer/llm_support.py +136 -0
- opik_optimizer/evolutionary_optimizer/mcp.py +249 -0
- opik_optimizer/evolutionary_optimizer/mutation_ops.py +306 -0
- opik_optimizer/evolutionary_optimizer/population_ops.py +228 -0
- opik_optimizer/evolutionary_optimizer/prompts.py +352 -0
- opik_optimizer/evolutionary_optimizer/reporting.py +28 -4
- opik_optimizer/evolutionary_optimizer/style_ops.py +86 -0
- opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +90 -81
- opik_optimizer/few_shot_bayesian_optimizer/reporting.py +12 -5
- opik_optimizer/gepa_optimizer/__init__.py +3 -0
- opik_optimizer/gepa_optimizer/adapter.py +154 -0
- opik_optimizer/gepa_optimizer/gepa_optimizer.py +653 -0
- opik_optimizer/gepa_optimizer/reporting.py +181 -0
- opik_optimizer/logging_config.py +42 -7
- opik_optimizer/mcp_utils/__init__.py +22 -0
- opik_optimizer/mcp_utils/mcp.py +541 -0
- opik_optimizer/mcp_utils/mcp_second_pass.py +152 -0
- opik_optimizer/mcp_utils/mcp_simulator.py +116 -0
- opik_optimizer/mcp_utils/mcp_workflow.py +547 -0
- opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +470 -134
- opik_optimizer/meta_prompt_optimizer/reporting.py +16 -2
- opik_optimizer/mipro_optimizer/_lm.py +30 -23
- opik_optimizer/mipro_optimizer/_mipro_optimizer_v2.py +52 -51
- opik_optimizer/mipro_optimizer/mipro_optimizer.py +126 -46
- opik_optimizer/mipro_optimizer/utils.py +2 -4
- opik_optimizer/optimizable_agent.py +21 -16
- opik_optimizer/optimization_config/chat_prompt.py +44 -23
- opik_optimizer/optimization_config/configs.py +3 -3
- opik_optimizer/optimization_config/mappers.py +9 -8
- opik_optimizer/optimization_result.py +22 -14
- opik_optimizer/reporting_utils.py +61 -10
- opik_optimizer/task_evaluator.py +9 -8
- opik_optimizer/utils/__init__.py +15 -0
- opik_optimizer/utils/colbert.py +236 -0
- opik_optimizer/{utils.py → utils/core.py} +160 -33
- opik_optimizer/utils/dataset_utils.py +49 -0
- opik_optimizer/utils/prompt_segments.py +186 -0
- opik_optimizer-2.0.0.dist-info/METADATA +345 -0
- opik_optimizer-2.0.0.dist-info/RECORD +74 -0
- opik_optimizer-2.0.0.dist-info/licenses/LICENSE +203 -0
- opik_optimizer-1.0.6.dist-info/METADATA +0 -181
- opik_optimizer-1.0.6.dist-info/RECORD +0 -50
- opik_optimizer-1.0.6.dist-info/licenses/LICENSE +0 -21
- {opik_optimizer-1.0.6.dist-info → opik_optimizer-2.0.0.dist-info}/WHEEL +0 -0
- {opik_optimizer-1.0.6.dist-info → opik_optimizer-2.0.0.dist-info}/top_level.txt +0 -0
opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py

@@ -1,19 +1,21 @@
+import copy
 import json
 import logging
 import os
-
+import textwrap
+import warnings
+from typing import Any, cast
+from collections.abc import Callable

 import litellm
 import opik
 from litellm.caching import Cache
 from litellm.types.caching import LiteLLMCacheType
 from opik import Dataset
-from opik.api_objects import opik_client
 from opik.environment import get_tqdm_for_current_environment
 from opik.evaluation.models.litellm import opik_monitor as opik_litellm_monitor

 from opik_optimizer import task_evaluator
-from opik_optimizer import utils

 from .. import _throttle
 from ..base_optimizer import BaseOptimizer, OptimizationRound
@@ -21,6 +23,15 @@ from ..optimization_config import chat_prompt, mappers
 from ..optimization_result import OptimizationResult
 from ..optimizable_agent import OptimizableAgent
 from . import reporting
+import re
+
+from ..mcp_utils.mcp import PROMPT_TOOL_FOOTER, PROMPT_TOOL_HEADER
+from ..mcp_utils.mcp_workflow import (
+    MCPExecutionConfig,
+    MCPSecondPassCoordinator,
+    extract_tool_arguments,
+)
+from ..utils.prompt_segments import apply_segment_updates, extract_prompt_segments

 tqdm = get_tqdm_for_current_environment()

@@ -34,6 +45,48 @@ logger = logging.getLogger(__name__)  # Gets logger configured by setup_logging
 _rate_limiter = _throttle.get_rate_limiter_for_current_opik_installation()


+def _sync_tool_description_in_system(prompt: chat_prompt.ChatPrompt) -> None:
+    if not prompt.system or not getattr(prompt, "tools", None):
+        return
+
+    description = (
+        prompt.tools[0].get("function", {}).get("description") if prompt.tools else None
+    )
+    if not description:
+        return
+
+    tool_name = (
+        prompt.tools[0].get("function", {}).get("name") if prompt.tools else None
+    )
+
+    system_text = cast(str, prompt.system)
+    if PROMPT_TOOL_HEADER not in system_text or PROMPT_TOOL_FOOTER not in system_text:
+        return
+
+    start = system_text.index(PROMPT_TOOL_HEADER) + len(PROMPT_TOOL_HEADER)
+    end = system_text.index(PROMPT_TOOL_FOOTER)
+    description_text = description.strip()
+    system_text = (
+        system_text[:start] + "\n" + description_text + "\n" + system_text[end:]
+    )
+    prompt.system = system_text
+
+    if tool_name:
+        pattern = rf"(-\s*{re.escape(tool_name)}:\s)(.*)"
+
+        def _tool_section_replacer(match: re.Match[str]) -> str:
+            return f"{match.group(1)}{description_text}"
+
+        system_text = re.sub(
+            pattern,
+            _tool_section_replacer,
+            system_text,
+            count=1,
+            flags=re.MULTILINE,
+        )
+        prompt.system = system_text
+
+
 class MetaPromptOptimizer(BaseOptimizer):
     """
     The Meta-Prompt Optimizer uses meta-prompting to improve prompts based on examples and performance.
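The helper above rewrites the tool description that sits between the `PROMPT_TOOL_HEADER` and `PROMPT_TOOL_FOOTER` markers inside the system message, and also refreshes any `- <tool_name>: ...` bullet that repeats it. A minimal standalone sketch of that replacement logic, using made-up marker strings (the real constants live in `opik_optimizer/mcp_utils/mcp.py`):

```python
import re

# Assumed placeholder values; the real constants are defined in opik_optimizer.mcp_utils.mcp.
PROMPT_TOOL_HEADER = "<<TOOL_DESCRIPTION>>"
PROMPT_TOOL_FOOTER = "<<END_TOOL_DESCRIPTION>>"


def sync_description(system_text: str, tool_name: str, description: str) -> str:
    """Replace the text between the tool markers and refresh the '- <tool>: ...' bullet."""
    if PROMPT_TOOL_HEADER not in system_text or PROMPT_TOOL_FOOTER not in system_text:
        return system_text
    start = system_text.index(PROMPT_TOOL_HEADER) + len(PROMPT_TOOL_HEADER)
    end = system_text.index(PROMPT_TOOL_FOOTER)
    updated = system_text[:start] + "\n" + description.strip() + "\n" + system_text[end:]
    # Mirror the optimizer's re.sub call: rewrite a single inline "- tool_name: ..." bullet.
    pattern = rf"(-\s*{re.escape(tool_name)}:\s)(.*)"
    return re.sub(pattern, lambda m: f"{m.group(1)}{description.strip()}", updated, count=1)


system = (
    "Use the available tools when helpful.\n"
    f"{PROMPT_TOOL_HEADER}\nold description\n{PROMPT_TOOL_FOOTER}\n"
    "- search_docs: old description"
)
print(sync_description(system, "search_docs", "Search the indexed docs and cite sources."))
```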
@@ -82,13 +135,14 @@ class MetaPromptOptimizer(BaseOptimizer):
     def __init__(
         self,
         model: str,
-        reasoning_model:
+        reasoning_model: str | None = None,
         rounds: int = DEFAULT_ROUNDS,
         num_prompts_per_round: int = DEFAULT_PROMPTS_PER_ROUND,
-        num_threads:
+        num_threads: int | None = None,
         verbose: int = 1,
         enable_context: bool = True,
         n_threads: int = 12,
+        seed: int = 42,
         **model_kwargs: Any,
     ) -> None:
         """
@@ -103,22 +157,28 @@ class MetaPromptOptimizer(BaseOptimizer):
             **model_kwargs: Additional model parameters
         """
         if "project_name" in model_kwargs:
-
-                "
+            warnings.warn(
+                "The 'project_name' parameter in optimizer constructor is deprecated. "
+                "Set project_name in the ChatPrompt instead.",
+                DeprecationWarning,
+                stacklevel=2,
             )
             del model_kwargs["project_name"]

-        super().__init__(model=model, verbose=verbose, **model_kwargs)
+        super().__init__(model=model, verbose=verbose, seed=seed, **model_kwargs)
         self.reasoning_model = reasoning_model if reasoning_model is not None else model
         self.rounds = rounds
         self.num_prompts_per_round = num_prompts_per_round
         if num_threads is not None:
-
+            warnings.warn(
+                "The 'num_threads' parameter is deprecated and will be removed in a future version. "
+                "Use 'n_threads' instead.",
+                DeprecationWarning,
+                stacklevel=2,
+            )
             n_threads = num_threads
         self.num_threads = n_threads
-        self.dataset:
-        self._opik_client = opik_client.get_client_cached()
-        self.llm_call_counter = 0
+        self.dataset: Dataset | None = None
         self.enable_context = enable_context
         logger.debug(
             f"Initialized MetaPromptOptimizer with model={model}, reasoning_model={self.reasoning_model}"
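Both constructor changes above are soft deprecations: the old arguments still work for now but raise `DeprecationWarning`. A migration sketch (the `ChatPrompt(project_name=...)` argument is an assumption taken from the warning text, not verified against the ChatPrompt signature):

```python
from opik_optimizer import MetaPromptOptimizer
from opik_optimizer.optimization_config.chat_prompt import ChatPrompt

# Before (1.0.x): project_name on the optimizer, num_threads for concurrency.
# optimizer = MetaPromptOptimizer(model="openai/gpt-4o", project_name="my-project", num_threads=8)

# After (2.0.0): project_name moves to the prompt, n_threads replaces num_threads,
# and the run can be seeded for reproducibility.
prompt = ChatPrompt(
    system="You are a helpful assistant.",
    user="{question}",
    project_name="my-project",  # assumption: ChatPrompt accepts project_name, per the deprecation message
)
optimizer = MetaPromptOptimizer(
    model="openai/gpt-4o",
    reasoning_model="openai/gpt-4o",
    rounds=3,
    num_prompts_per_round=4,
    n_threads=8,
    seed=42,
)
```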
@@ -127,16 +187,24 @@ class MetaPromptOptimizer(BaseOptimizer):
             f"Optimization rounds: {rounds}, Prompts/round: {num_prompts_per_round}"
         )

+    def get_optimizer_metadata(self) -> dict[str, Any]:
+        return {
+            "rounds": self.rounds,
+            "num_prompts_per_round": self.num_prompts_per_round,
+            "reasoning_model": self.reasoning_model,
+            "enable_context": self.enable_context,
+        }
+
     @_throttle.rate_limited(_rate_limiter)
     def _call_model(
         self,
         project_name: str,
-        messages:
+        messages: list[dict[str, str]],
         is_reasoning: bool = False,
-        optimization_id:
+        optimization_id: str | None = None,
     ) -> str:
         """Call the model with the given prompt and return the response."""
-        self.
+        self.increment_llm_counter()
         # Note: Basic retry logic could be added here using tenacity
         try:
             # Basic LLM parameters (e.g., temperature, max_tokens)
@@ -163,7 +231,7 @@ class MetaPromptOptimizer(BaseOptimizer):
         }

         # Prepare metadata that we want to be part of the LLM call context.
-        metadata_for_opik:
+        metadata_for_opik: dict[str, Any] = {}
         if project_name:
             metadata_for_opik["project_name"] = (
                 project_name  # Top-level for general use
@@ -225,11 +293,12 @@ class MetaPromptOptimizer(BaseOptimizer):
         prompt: chat_prompt.ChatPrompt,
         dataset: opik.Dataset,
         metric: Callable,
-        n_samples:
-        dataset_item_ids:
-        experiment_config:
+        n_samples: int | None = None,
+        dataset_item_ids: list[str] | None = None,
+        experiment_config: dict | None = None,
         use_full_dataset: bool = True,
-        optimization_id:
+        optimization_id: str | None = None,
+        mcp_config: MCPExecutionConfig | None = None,
         **kwargs: Any,
     ) -> float:
         """
@@ -266,58 +335,103 @@ class MetaPromptOptimizer(BaseOptimizer):
             subset_size = None  # Use all items for final checks
             logger.debug("Using full dataset for evaluation")

-
-
-
-
-
-
-
-
-            "
-            "
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            # "content": item["content"].format(**dataset_item),
-            # }
-            # for item in prompt.get_messages()
-            # ]
-            # Step 1: create the agent
+        configuration_updates = self._drop_none(
+            {
+                "n_samples": subset_size,
+                "use_full_dataset": use_full_dataset,
+            }
+        )
+        meta_metadata = self._drop_none(
+            {
+                "optimization_id": optimization_id,
+                "stage": "trial_evaluation" if not use_full_dataset else "final_eval",
+            }
+        )
+        experiment_config = self._prepare_experiment_config(
+            prompt=prompt,
+            dataset=dataset,
+            metric=metric,
+            experiment_config=experiment_config,
+            configuration_updates=configuration_updates,
+            additional_metadata={"meta_prompt": meta_metadata}
+            if meta_metadata
+            else None,
+        )
+
+        def llm_task(dataset_item: dict[str, Any]) -> dict[str, str]:
             new_prompt = prompt.copy()
             messages = new_prompt.get_messages(dataset_item)
             new_prompt.set_messages(messages)
             agent = self.agent_class(new_prompt)

-
-
-
-
-
-
-
-
-
-
-
-
-
+            if mcp_config is not None:
+                coordinator = mcp_config.coordinator
+                coordinator.reset()
+                try:
+                    logger.debug(
+                        "Calling MCP-enabled LLM with tool access; prompt length=%s",
+                        sum(len(msg["content"]) for msg in messages),
+                    )
+                    raw_model_output = agent.llm_invoke(
+                        messages=messages,
+                        seed=self.seed,
+                        allow_tool_use=True,
+                    )
+                except Exception as exc:
+                    logger.error("Error during MCP first pass: %s", exc)
+                    raise
+
+                second_pass_messages = coordinator.build_second_pass_messages(
+                    base_messages=messages,
+                    dataset_item=dataset_item,
                 )
-            raise

-
-
+                if second_pass_messages is None and mcp_config.fallback_invoker:
+                    fallback_args = mcp_config.fallback_arguments(dataset_item)
+                    if fallback_args:
+                        logger.debug(
+                            "MCP fallback triggered for tool %s with args=%s",
+                            mcp_config.tool_name,
+                            fallback_args,
+                        )
+                        summary_override = mcp_config.fallback_invoker(fallback_args)
+                        second_pass_messages = coordinator.build_second_pass_messages(
+                            base_messages=messages,
+                            dataset_item=dataset_item,
+                            summary_override=summary_override,
+                        )
+
+                if second_pass_messages is not None:
+                    logger.debug(
+                        "Executing MCP second pass with %d messages",
+                        len(second_pass_messages),
+                    )
+                    final_response = agent.llm_invoke(
+                        messages=second_pass_messages,
+                        seed=self.seed,
+                        allow_tool_use=mcp_config.allow_tool_use_on_second_pass,
+                    )
+                else:
+                    final_response = raw_model_output
+
+                cleaned_model_output = final_response.strip()
+            else:
+                try:
+                    logger.debug(
+                        f"Calling LLM with prompt length: {sum(len(msg['content']) for msg in messages)}"
+                    )
+                    raw_model_output = agent.invoke(messages)
+                    logger.debug(f"LLM raw response length: {len(raw_model_output)}")
+                    logger.debug(f"LLM raw output: {raw_model_output}")
+                except Exception as e:
+                    logger.error(f"Error calling model with prompt: {e}")
+                    logger.error(f"Failed prompt: {messages}")
+                    logger.error(
+                        f"Prompt length: {sum(len(msg['content']) for msg in messages)}"
+                    )
+                    raise
+
+                cleaned_model_output = raw_model_output.strip()

             result = {
                 mappers.EVALUATED_LLM_TASK_OUTPUT: cleaned_model_output,
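Condensed, the new `mcp_config` branch of `llm_task` is a two-pass exchange: a tool-enabled first call, an optional second pass built by the coordinator, and a direct tool invocation as fallback when the model never called the tool. A sketch of that control flow with the agent and coordinator stubbed out (attribute and keyword names follow the diff; everything else is illustrative):

```python
from typing import Any, Callable, Optional


def run_two_pass(
    agent: Any,
    coordinator: Any,
    messages: list[dict[str, str]],
    dataset_item: dict[str, Any],
    fallback_arguments: Callable[[dict[str, Any]], dict[str, Any]],
    fallback_invoker: Optional[Callable[[dict[str, Any]], str]] = None,
    allow_tool_use_on_second_pass: bool = False,
    seed: int = 42,
) -> str:
    coordinator.reset()
    # First pass: the model may call the MCP tool.
    raw_output = agent.llm_invoke(messages=messages, seed=seed, allow_tool_use=True)

    second = coordinator.build_second_pass_messages(
        base_messages=messages, dataset_item=dataset_item
    )
    # Fallback: no tool call was recorded, so invoke the tool directly with extracted arguments.
    if second is None and fallback_invoker is not None:
        args = fallback_arguments(dataset_item)
        if args:
            summary = fallback_invoker(args)
            second = coordinator.build_second_pass_messages(
                base_messages=messages, dataset_item=dataset_item, summary_override=summary
            )

    # Second pass: answer again with the tool output folded into the conversation.
    if second is not None:
        return agent.llm_invoke(
            messages=second, seed=seed, allow_tool_use=allow_tool_use_on_second_pass
        ).strip()
    return raw_output.strip()
```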
@@ -348,46 +462,39 @@ class MetaPromptOptimizer(BaseOptimizer):
         prompt: chat_prompt.ChatPrompt,
         dataset: Dataset,
         metric: Callable,
-        experiment_config:
-        n_samples:
+        experiment_config: dict | None = None,
+        n_samples: int | None = None,
         auto_continue: bool = False,
-        agent_class:
+        agent_class: type[OptimizableAgent] | None = None,
         **kwargs: Any,
     ) -> OptimizationResult:
+        mcp_config = kwargs.pop("mcp_config", None)
+        candidate_generator = kwargs.pop("candidate_generator", None)
+        candidate_generator_kwargs = kwargs.pop("candidate_generator_kwargs", None)
+
         """
         Optimize a prompt using meta-reasoning.

         Args:
+            prompt: The prompt to optimize
             dataset: The dataset to evaluate against
             metric: The metric to use for evaluation
             experiment_config: A dictionary to log with the experiments
             n_samples: The number of dataset items to use for evaluation
             auto_continue: If True, the algorithm may continue if goal not met
-
+            agent_class: Optional agent class to use
+            **kwargs: Additional arguments for evaluation, including:
+                mcp_config (MCPExecutionConfig | None): MCP tool calling configuration (default: None)
+                candidate_generator: Optional candidate generator
+                candidate_generator_kwargs: Optional kwargs for candidate generator

         Returns:
             OptimizationResult: Structured result containing optimization details
         """
-
-
-
-
-            raise ValueError("Dataset must be a Dataset object")
-
-        if not callable(metric):
-            raise ValueError(
-                "Metric must be a function that takes `dataset_item` and `llm_output` as arguments."
-            )
-
-        if prompt.model is None:
-            prompt.model = self.model
-        if prompt.model_kwargs is None:
-            prompt.model_kwargs = self.model_kwargs
-
-        if agent_class is None:
-            self.agent_class = utils.create_litellm_agent_class(prompt)
-        else:
-            self.agent_class = agent_class
+        # Use base class validation and setup methods
+        self.validate_optimization_inputs(prompt, dataset, metric)
+        self.configure_prompt_model(prompt)
+        self.agent_class = self.setup_agent_class(prompt, agent_class)

         total_items = len(dataset.get_items())
         if n_samples is not None and n_samples > total_items:
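The removed block shows what the 1.0.x method did inline; 2.0.0 folds the same checks into three `BaseOptimizer` helpers. A rough restatement of the removed logic (illustrative only; the dataset type check is inferred from its error message, and the real helpers in `opik_optimizer/base_optimizer.py` may differ in detail):

```python
from collections.abc import Callable

from opik import Dataset
from opik_optimizer import utils  # 1.0.x import; 2.0.0 moves utils.py to utils/core.py


def legacy_setup(optimizer, prompt, dataset, metric: Callable, agent_class=None) -> None:
    """Restates the removed 1.0.x checks; 2.0.0 calls the BaseOptimizer helpers instead."""
    if not isinstance(dataset, Dataset):  # now validate_optimization_inputs()
        raise ValueError("Dataset must be a Dataset object")
    if not callable(metric):
        raise ValueError(
            "Metric must be a function that takes `dataset_item` and `llm_output` as arguments."
        )
    if prompt.model is None:  # now configure_prompt_model()
        prompt.model = optimizer.model
    if prompt.model_kwargs is None:
        prompt.model_kwargs = optimizer.model_kwargs
    if agent_class is None:  # now setup_agent_class()
        optimizer.agent_class = utils.create_litellm_agent_class(prompt)
    else:
        optimizer.agent_class = agent_class
```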
@@ -398,7 +505,7 @@ class MetaPromptOptimizer(BaseOptimizer):

         optimization = None
         try:
-            optimization = self.
+            optimization = self.opik_client.create_optimization(
                 dataset_name=dataset.name,
                 objective_name=getattr(metric, "__name__", str(metric)),
                 metadata={"optimizer": self.__class__.__name__},
@@ -424,6 +531,7 @@ class MetaPromptOptimizer(BaseOptimizer):
                 "auto_continue": auto_continue,
             },
             verbose=self.verbose,
+            tools=getattr(prompt, "tools", None),
         )

         try:
@@ -436,6 +544,9 @@ class MetaPromptOptimizer(BaseOptimizer):
                 experiment_config=experiment_config,
                 n_samples=n_samples,
                 auto_continue=auto_continue,
+                mcp_config=mcp_config,
+                candidate_generator=candidate_generator,
+                candidate_generator_kwargs=candidate_generator_kwargs,
                 **kwargs,
             )
             if optimization:
|
@@ -449,40 +560,104 @@ class MetaPromptOptimizer(BaseOptimizer):
|
|
449
560
|
logger.debug("Optimization marked as cancelled")
|
450
561
|
raise e
|
451
562
|
|
563
|
+
def optimize_mcp(
|
564
|
+
self,
|
565
|
+
prompt: chat_prompt.ChatPrompt,
|
566
|
+
dataset: Dataset,
|
567
|
+
metric: Callable,
|
568
|
+
*,
|
569
|
+
tool_name: str,
|
570
|
+
second_pass: MCPSecondPassCoordinator,
|
571
|
+
experiment_config: dict | None = None,
|
572
|
+
n_samples: int | None = None,
|
573
|
+
auto_continue: bool = False,
|
574
|
+
agent_class: type[OptimizableAgent] | None = None,
|
575
|
+
fallback_invoker: Callable[[dict[str, Any]], str] | None = None,
|
576
|
+
fallback_arguments: Callable[[Any], dict[str, Any]] | None = None,
|
577
|
+
allow_tool_use_on_second_pass: bool = False,
|
578
|
+
**kwargs: Any,
|
579
|
+
) -> OptimizationResult:
|
580
|
+
panel_style = kwargs.pop("tool_panel_style", "bright_magenta")
|
581
|
+
|
582
|
+
if prompt.tools is None or not prompt.tools:
|
583
|
+
raise ValueError("Prompt must include tools for MCP optimization")
|
584
|
+
|
585
|
+
fallback_args_fn = fallback_arguments or extract_tool_arguments
|
586
|
+
|
587
|
+
if fallback_invoker is None:
|
588
|
+
function_map = prompt.function_map or {}
|
589
|
+
fallback_invoker = function_map.get(tool_name)
|
590
|
+
|
591
|
+
mcp_config = MCPExecutionConfig(
|
592
|
+
coordinator=second_pass,
|
593
|
+
tool_name=tool_name,
|
594
|
+
fallback_arguments=fallback_args_fn,
|
595
|
+
fallback_invoker=fallback_invoker,
|
596
|
+
allow_tool_use_on_second_pass=allow_tool_use_on_second_pass,
|
597
|
+
)
|
598
|
+
|
599
|
+
tool_segment_id = f"tool:{tool_name}"
|
600
|
+
segments = extract_prompt_segments(prompt)
|
601
|
+
if tool_segment_id not in {segment.segment_id for segment in segments}:
|
602
|
+
raise ValueError(f"Tool '{tool_name}' not present in prompt tools")
|
603
|
+
|
604
|
+
return self.optimize_prompt(
|
605
|
+
prompt=prompt,
|
606
|
+
dataset=dataset,
|
607
|
+
metric=metric,
|
608
|
+
experiment_config=experiment_config,
|
609
|
+
n_samples=n_samples,
|
610
|
+
auto_continue=auto_continue,
|
611
|
+
agent_class=agent_class,
|
612
|
+
mcp_config=mcp_config,
|
613
|
+
candidate_generator=self._generate_mcp_candidate_prompts,
|
614
|
+
candidate_generator_kwargs={
|
615
|
+
"tool_segment_id": tool_segment_id,
|
616
|
+
"tool_name": tool_name,
|
617
|
+
"panel_style": panel_style,
|
618
|
+
},
|
619
|
+
tool_panel_style=panel_style,
|
620
|
+
**kwargs,
|
621
|
+
)
|
622
|
+
|
452
623
|
def _optimize_prompt(
|
453
624
|
self,
|
454
|
-
optimization_id:
|
625
|
+
optimization_id: str | None,
|
455
626
|
prompt: chat_prompt.ChatPrompt,
|
456
627
|
dataset: Dataset,
|
457
628
|
metric: Callable,
|
458
|
-
experiment_config:
|
459
|
-
n_samples:
|
629
|
+
experiment_config: dict | None,
|
630
|
+
n_samples: int | None,
|
460
631
|
auto_continue: bool,
|
632
|
+
mcp_config: MCPExecutionConfig | None = None,
|
633
|
+
candidate_generator: None
|
634
|
+
| (Callable[..., list[chat_prompt.ChatPrompt]]) = None,
|
635
|
+
candidate_generator_kwargs: dict[str, Any] | None = None,
|
636
|
+
tool_panel_style: str = "bright_magenta",
|
461
637
|
**kwargs: Any,
|
462
638
|
) -> OptimizationResult:
|
463
639
|
self.auto_continue = auto_continue
|
464
640
|
self.dataset = dataset
|
465
641
|
self.prompt = prompt
|
466
|
-
self.
|
642
|
+
self.reset_counters() # Reset counters for run
|
467
643
|
initial_prompt = prompt
|
468
644
|
|
469
645
|
current_prompt = prompt
|
470
|
-
|
471
|
-
|
472
|
-
|
473
|
-
|
474
|
-
|
475
|
-
|
476
|
-
|
477
|
-
|
478
|
-
|
479
|
-
|
480
|
-
|
481
|
-
|
482
|
-
|
483
|
-
|
484
|
-
|
485
|
-
}
|
646
|
+
configuration_updates = self._drop_none(
|
647
|
+
{
|
648
|
+
"rounds": self.rounds,
|
649
|
+
"num_prompts_per_round": self.num_prompts_per_round,
|
650
|
+
}
|
651
|
+
)
|
652
|
+
meta_metadata = {"stage": "initial"}
|
653
|
+
experiment_config = self._prepare_experiment_config(
|
654
|
+
prompt=prompt,
|
655
|
+
dataset=dataset,
|
656
|
+
metric=metric,
|
657
|
+
experiment_config=experiment_config,
|
658
|
+
configuration_updates=configuration_updates,
|
659
|
+
additional_metadata={"meta_prompt": meta_metadata},
|
660
|
+
)
|
486
661
|
|
487
662
|
with reporting.display_evaluation(verbose=self.verbose) as baseline_reporter:
|
488
663
|
initial_score = self._evaluate_prompt(
|
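A hedged usage sketch for the new `optimize_mcp` entry point. The tool schema follows the OpenAI function format the diff reads from `prompt.tools`; passing `tools`/`function_map` to the `ChatPrompt` constructor and the construction of the second-pass coordinator are assumptions, not confirmed API:

```python
from opik_optimizer import MetaPromptOptimizer
from opik_optimizer.optimization_config.chat_prompt import ChatPrompt


def accuracy_metric(dataset_item, llm_output):  # metric contract: dataset_item + llm_output
    return float(dataset_item["expected_answer"].lower() in llm_output.lower())


prompt = ChatPrompt(
    system="Answer the question. Use the documentation tool when needed.",
    user="{question}",
    tools=[
        {
            "type": "function",
            "function": {
                "name": "search_docs",
                "description": "Search the documentation index.",
                "parameters": {
                    "type": "object",
                    "properties": {"query": {"type": "string"}},
                    "required": ["query"],
                },
            },
        }
    ],
    # The fallback invoker is called with the extracted argument dict, mirroring
    # fallback_invoker(fallback_args) in the diff above.
    function_map={"search_docs": lambda args: f"Top passages for: {args.get('query', '')}"},
)

optimizer = MetaPromptOptimizer(model="openai/gpt-4o", n_threads=8, seed=42)
result = optimizer.optimize_mcp(
    prompt=prompt,
    dataset=dataset,          # placeholder: an opik.Dataset with "question"/"expected_answer" items
    metric=accuracy_metric,
    tool_name="search_docs",
    second_pass=second_pass,  # placeholder: an MCPSecondPassCoordinator from opik_optimizer.mcp_utils.mcp_workflow
    n_samples=20,
)
print(result.tool_prompts)    # e.g. {"search_docs": "<optimized description>"}
```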
@@ -494,10 +669,11 @@ class MetaPromptOptimizer(BaseOptimizer):
                 experiment_config=experiment_config,
                 use_full_dataset=n_samples is None,
                 verbose=self.verbose,
+                mcp_config=mcp_config,
             )
             best_score = initial_score
             best_prompt = current_prompt
-            rounds:
+            rounds: list[OptimizationRound] = []

             baseline_reporter.set_score(initial_score)

@@ -510,8 +686,11 @@ class MetaPromptOptimizer(BaseOptimizer):
             previous_best_score = best_score

             # Step 1. Create a set of candidate prompts
+            generator = candidate_generator or self._generate_candidate_prompts
+            generator_kwargs = dict(candidate_generator_kwargs or {})
+
             try:
-                candidate_prompts =
+                candidate_prompts = generator(
                     project_name=self.agent_class.project_name,
                     current_prompt=best_prompt,
                     best_score=best_score,
@@ -519,25 +698,25 @@ class MetaPromptOptimizer(BaseOptimizer):
                     previous_rounds=rounds,
                     metric=metric,
                     optimization_id=optimization_id,
+                    **generator_kwargs,
                 )
             except Exception as e:
                 round_reporter.failed_to_generate(self.num_prompts_per_round, e)
                 continue

             # Step 2. Score each candidate prompt
-            prompt_scores:
+            prompt_scores: list[tuple[chat_prompt.ChatPrompt, float]] = []
             for candidate_count, prompt in enumerate(candidate_prompts):
                 with reporting.display_prompt_candidate_scoring_report(
                     verbose=self.verbose
                 ) as eval_report:
                     eval_report.set_generated_prompts(candidate_count, prompt)

-
-                    new_prompt.set_messages(prompt.get_messages())
+                    candidate_prompt = prompt.copy()

                     try:
                         prompt_score = self._evaluate_prompt(
-                            prompt=
+                            prompt=candidate_prompt,
                             optimization_id=optimization_id,
                             dataset=dataset,
                             metric=metric,
@@ -545,11 +724,12 @@ class MetaPromptOptimizer(BaseOptimizer):
                             use_full_dataset=False,
                             experiment_config=experiment_config,
                             verbose=self.verbose,
+                            mcp_config=mcp_config,
                         )

                         eval_report.set_final_score(best_score, prompt_score)
                     except Exception:
-
+                        logger.warning("Failed evaluating agent; continuing...")
                         prompt_score = 0

                     prompt_scores.append((prompt, prompt_score))
@@ -584,24 +764,39 @@ class MetaPromptOptimizer(BaseOptimizer):
                 best_score = best_cand_score_avg
                 best_prompt = best_candidate_this_round

+        if tool_panel_style and getattr(best_prompt, "tools", None):
+            description = (
+                best_prompt.tools[0].get("function", {}).get("description", "")
+                if best_prompt.tools
+                else ""
+            )
+            if description.strip():
+                reporting.display_tool_description(
+                    description.strip(),
+                    "Final tool description",
+                    tool_panel_style,
+                )
+
         reporting.display_result(
             initial_score,
             best_score,
             best_prompt.get_messages() if best_prompt is not None else [],
             verbose=self.verbose,
+            tools=getattr(best_prompt, "tools", None) if best_prompt else None,
         )

         return self._create_result(
             metric,
-            initial_prompt=
-
-
+            initial_prompt=(
+                initial_prompt.get_messages() if initial_prompt is not None else []
+            ),
             best_prompt=best_prompt.get_messages() if best_prompt is not None else [],
             best_score=best_score,
             initial_score=initial_score,
             rounds=rounds,
             dataset_id=dataset.id,
             optimization_id=optimization_id,
+            best_tools=getattr(best_prompt, "tools", None) if best_prompt else None,
         )

     def _calculate_improvement(
@@ -620,19 +815,24 @@ class MetaPromptOptimizer(BaseOptimizer):
         current_best_prompt: chat_prompt.ChatPrompt,
         current_best_score: float,
         best_prompt_overall: chat_prompt.ChatPrompt,
-        evaluated_candidates:
+        evaluated_candidates: list[tuple[chat_prompt.ChatPrompt, float]],
         previous_best_score: float,
         improvement_this_round: float,
     ) -> OptimizationRound:
         """Create an OptimizationRound object with the current round's data."""
-        generated_prompts_log = []
+        generated_prompts_log: list[dict[str, Any]] = []
         for prompt, score in evaluated_candidates:
             improvement_vs_prev = self._calculate_improvement(
                 score, previous_best_score
             )
+            tool_entries: list[Any] = []
+            if getattr(prompt, "tools", None):
+                tool_entries = copy.deepcopy(list(prompt.tools or []))
+
             generated_prompts_log.append(
                 {
                     "prompt": prompt.get_messages(),
+                    "tools": tool_entries,
                     "score": score,
                     "improvement": improvement_vs_prev,
                 }
@@ -651,13 +851,14 @@ class MetaPromptOptimizer(BaseOptimizer):
     def _create_result(
         self,
         metric: Callable,
-        initial_prompt:
-        best_prompt:
+        initial_prompt: list[dict[str, str]],
+        best_prompt: list[dict[str, str]],
         best_score: float,
         initial_score: float,
-        rounds:
-        dataset_id:
-        optimization_id:
+        rounds: list[OptimizationRound],
+        dataset_id: str | None,
+        optimization_id: str | None,
+        best_tools: list[dict[str, Any]] | None,
     ) -> OptimizationResult:
         """Create the final OptimizationResult object."""
         details = {
@@ -670,6 +871,18 @@ class MetaPromptOptimizer(BaseOptimizer):
             "temperature": self.model_kwargs.get("temperature"),
         }

+        if best_tools:
+            details["final_tools"] = best_tools
+
+        tool_prompts = None
+        if best_tools:
+            tool_prompts = {
+                (tool.get("function", {}).get("name") or f"tool_{idx}"): tool.get(
+                    "function", {}
+                ).get("description")
+                for idx, tool in enumerate(best_tools)
+            }
+
         return OptimizationResult(
             optimizer=self.__class__.__name__,
             prompt=best_prompt,
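For reference, the `tool_prompts` mapping built above is keyed by tool name (falling back to a positional `tool_N` key) and valued by the final description. A standalone run of the same comprehension:

```python
best_tools = [
    {
        "type": "function",
        "function": {
            "name": "search_docs",
            "description": "Search the documentation index and return the top passages.",
        },
    },
    {"type": "function", "function": {}},  # unnamed tool falls back to a positional key
]

tool_prompts = {
    (tool.get("function", {}).get("name") or f"tool_{idx}"): tool.get("function", {}).get("description")
    for idx, tool in enumerate(best_tools)
}
print(tool_prompts)
# {'search_docs': 'Search the documentation index and return the top passages.', 'tool_1': None}
```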
@@ -679,8 +892,10 @@ class MetaPromptOptimizer(BaseOptimizer):
             metric_name=getattr(metric, "__name__", str(metric)),
             details=details,
             llm_calls=self.llm_call_counter,
+            tool_calls=self.tool_call_counter,
             dataset_id=dataset_id,
             optimization_id=optimization_id,
+            tool_prompts=tool_prompts,
         )

     def _get_task_context(self, metric: Callable) -> str:
@@ -714,11 +929,11 @@ class MetaPromptOptimizer(BaseOptimizer):
         current_prompt: chat_prompt.ChatPrompt,
         best_score: float,
         round_num: int,
-        previous_rounds:
+        previous_rounds: list[OptimizationRound],
         metric: Callable,
-        optimization_id:
-        project_name:
-    ) ->
+        optimization_id: str | None = None,
+        project_name: str | None = None,
+    ) -> list[chat_prompt.ChatPrompt]:
         """Generate candidate prompts using meta-prompting."""
         with reporting.display_candidate_generation_report(
             self.num_prompts_per_round, verbose=self.verbose
@@ -819,7 +1034,7 @@ class MetaPromptOptimizer(BaseOptimizer):
             )

             # Extract and log valid prompts
-            valid_prompts:
+            valid_prompts: list[chat_prompt.ChatPrompt] = []
             for item in json_result["prompts"]:
                 if (
                     isinstance(item, dict)
@@ -870,7 +1085,128 @@ class MetaPromptOptimizer(BaseOptimizer):
                 f"Unexpected error during candidate prompt generation: {e}"
             )

-    def
+    def _generate_mcp_candidate_prompts(
+        self,
+        current_prompt: chat_prompt.ChatPrompt,
+        best_score: float,
+        round_num: int,
+        previous_rounds: list[OptimizationRound],
+        metric: Callable,
+        tool_segment_id: str,
+        tool_name: str,
+        optimization_id: str | None = None,
+        project_name: str | None = None,
+        panel_style: str = "bright_magenta",
+    ) -> list[chat_prompt.ChatPrompt]:
+        segments = {
+            segment.segment_id: segment
+            for segment in extract_prompt_segments(current_prompt)
+        }
+        if tool_segment_id not in segments:
+            raise ValueError(f"Tool segment '{tool_segment_id}' not found in prompt")
+
+        target_segment = segments[tool_segment_id]
+        current_description = target_segment.content
+        tool_metadata = target_segment.metadata.get("raw_tool", {})
+
+        history_context = self._build_history_context(previous_rounds)
+
+        instruction = textwrap.dedent(
+            f"""
+            Current tool name: {tool_name}
+            Current tool description:
+            ---
+            {current_description}
+            ---
+
+            Tool metadata (JSON):
+            {json.dumps(tool_metadata, indent=2)}
+
+            Current best score: {best_score:.4f}
+            {history_context}
+
+            Generate {self.num_prompts_per_round} improved descriptions for this tool.
+            Each description should clarify expected input arguments and set explicit expectations
+            for how the tool output must be used in the final response.
+            Avoid changing unrelated parts of the prompt. Focus only on the description text for `{tool_name}`.
+
+            Return a JSON object of the form:
+            {{
+              "prompts": [
+                {{
+                  "tool_description": "...",
+                  "improvement_focus": "...",
+                  "reasoning": "..."
+                }}
+              ]
+            }}
+            """
+        ).strip()
+
+        with reporting.display_candidate_generation_report(
+            self.num_prompts_per_round, verbose=self.verbose
+        ) as candidate_generation_report:
+            try:
+                content = self._call_model(
+                    project_name,
+                    messages=[
+                        {"role": "system", "content": self._REASONING_SYSTEM_PROMPT},
+                        {"role": "user", "content": instruction},
+                    ],
+                    is_reasoning=True,
+                    optimization_id=optimization_id,
+                )
+
+                try:
+                    json_result = json.loads(content)
+                except json.JSONDecodeError:
+                    import re
+
+                    json_match = re.search(r"\{.*\}", content, re.DOTALL)
+                    if not json_match:
+                        raise ValueError("No JSON object found in reasoning output")
+                    json_result = json.loads(json_match.group())
+
+                prompts_payload = json_result.get("prompts")
+                if not isinstance(prompts_payload, list):
+                    raise ValueError("Reasoning output missing 'prompts' list")
+
+                candidate_generation_report.set_generated_prompts()
+
+                candidates: list[chat_prompt.ChatPrompt] = []
+                for item in prompts_payload:
+                    if not isinstance(item, dict):
+                        continue
+                    description = item.get("tool_description")
+                    if not isinstance(description, str) or not description.strip():
+                        continue
+
+                    updated_prompt = apply_segment_updates(
+                        current_prompt,
+                        {tool_segment_id: description.strip()},
+                    )
+                    _sync_tool_description_in_system(updated_prompt)
+                    if (
+                        description.strip()
+                        and description.strip() != current_description.strip()
+                    ):
+                        reporting.display_tool_description(
+                            description.strip(),
+                            f"Round {round_num + 1} tool description",
+                            panel_style,
+                        )
+                    candidates.append(updated_prompt)
+
+                if not candidates:
+                    raise ValueError(
+                        "Reasoning output did not produce valid tool descriptions"
+                    )
+
+                return candidates
+            except Exception as exc:
+                raise ValueError(f"Error generating MCP prompt candidates: {exc}")
+
+    def _build_history_context(self, previous_rounds: list[OptimizationRound]) -> str:
         """Build context from previous optimization rounds."""
         if not previous_rounds:
             return ""
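`_generate_mcp_candidate_prompts` expects the reasoning model to answer with the JSON object requested in the instruction, and falls back to a greedy regex extraction when the reply wraps that JSON in prose. A small example of the parsing path:

```python
import json
import re

content = """Here are the rewrites you asked for:
{
  "prompts": [
    {
      "tool_description": "Search the documentation index. Pass `query` as a short keyword phrase; cite the returned passages in the final answer.",
      "improvement_focus": "argument expectations",
      "reasoning": "Clarifies the input format and how output must be used."
    }
  ]
}"""

try:
    json_result = json.loads(content)
except json.JSONDecodeError:
    # Same fallback as the optimizer: grab the outermost {...} block from the reply.
    match = re.search(r"\{.*\}", content, re.DOTALL)
    json_result = json.loads(match.group())

print(json_result["prompts"][0]["tool_description"])
```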
@@ -896,7 +1232,7 @@ class MetaPromptOptimizer(BaseOptimizer):

     def _get_evaluation_subset(
         self, dataset: opik.Dataset, min_size: int = 20, max_size: int = 100
-    ) ->
+    ) -> list[dict[str, Any]]:
         """Get a random subset of the dataset for evaluation.

         Returns:
|