opik-optimizer 1.0.5__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff compares two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- opik_optimizer/__init__.py +2 -0
- opik_optimizer/_throttle.py +2 -1
- opik_optimizer/base_optimizer.py +28 -11
- opik_optimizer/colbert.py +236 -0
- opik_optimizer/data/context7_eval.jsonl +3 -0
- opik_optimizer/datasets/context7_eval.py +90 -0
- opik_optimizer/datasets/tiny_test.py +33 -34
- opik_optimizer/datasets/truthful_qa.py +2 -2
- opik_optimizer/evolutionary_optimizer/crossover_ops.py +194 -0
- opik_optimizer/evolutionary_optimizer/evaluation_ops.py +73 -0
- opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +124 -941
- opik_optimizer/evolutionary_optimizer/helpers.py +10 -0
- opik_optimizer/evolutionary_optimizer/llm_support.py +134 -0
- opik_optimizer/evolutionary_optimizer/mutation_ops.py +292 -0
- opik_optimizer/evolutionary_optimizer/population_ops.py +223 -0
- opik_optimizer/evolutionary_optimizer/prompts.py +305 -0
- opik_optimizer/evolutionary_optimizer/reporting.py +16 -4
- opik_optimizer/evolutionary_optimizer/style_ops.py +86 -0
- opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +26 -23
- opik_optimizer/few_shot_bayesian_optimizer/reporting.py +12 -5
- opik_optimizer/gepa_optimizer/__init__.py +3 -0
- opik_optimizer/gepa_optimizer/adapter.py +152 -0
- opik_optimizer/gepa_optimizer/gepa_optimizer.py +556 -0
- opik_optimizer/gepa_optimizer/reporting.py +181 -0
- opik_optimizer/logging_config.py +42 -7
- opik_optimizer/mcp_utils/__init__.py +22 -0
- opik_optimizer/mcp_utils/mcp.py +541 -0
- opik_optimizer/mcp_utils/mcp_second_pass.py +152 -0
- opik_optimizer/mcp_utils/mcp_simulator.py +116 -0
- opik_optimizer/mcp_utils/mcp_workflow.py +493 -0
- opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +399 -69
- opik_optimizer/meta_prompt_optimizer/reporting.py +16 -2
- opik_optimizer/mipro_optimizer/_lm.py +20 -20
- opik_optimizer/mipro_optimizer/_mipro_optimizer_v2.py +51 -50
- opik_optimizer/mipro_optimizer/mipro_optimizer.py +33 -28
- opik_optimizer/mipro_optimizer/utils.py +2 -4
- opik_optimizer/optimizable_agent.py +18 -17
- opik_optimizer/optimization_config/chat_prompt.py +44 -23
- opik_optimizer/optimization_config/configs.py +3 -3
- opik_optimizer/optimization_config/mappers.py +9 -8
- opik_optimizer/optimization_result.py +21 -14
- opik_optimizer/reporting_utils.py +61 -10
- opik_optimizer/task_evaluator.py +9 -8
- opik_optimizer/utils/__init__.py +15 -0
- opik_optimizer/{utils.py → utils/core.py} +111 -26
- opik_optimizer/utils/dataset_utils.py +49 -0
- opik_optimizer/utils/prompt_segments.py +186 -0
- {opik_optimizer-1.0.5.dist-info → opik_optimizer-1.1.0.dist-info}/METADATA +93 -16
- opik_optimizer-1.1.0.dist-info/RECORD +73 -0
- opik_optimizer-1.1.0.dist-info/licenses/LICENSE +203 -0
- opik_optimizer-1.0.5.dist-info/RECORD +0 -50
- opik_optimizer-1.0.5.dist-info/licenses/LICENSE +0 -21
- {opik_optimizer-1.0.5.dist-info → opik_optimizer-1.1.0.dist-info}/WHEEL +0 -0
- {opik_optimizer-1.0.5.dist-info → opik_optimizer-1.1.0.dist-info}/top_level.txt +0 -0
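Going by the file list, the headline additions in 1.1.0 are the new `gepa_optimizer` and `mcp_utils` packages, a `context7_eval` dataset, prompt-segment utilities, and the `utils.py` → `utils/` package split. A quick import smoke test against the new module paths (the paths come from the list above; treat the symbol names as assumptions except where the diff below confirms them):

```python
# Module paths taken from the file list above; apply_segment_updates,
# extract_prompt_segments, and create_litellm_agent_class are confirmed by the
# import hunks in the diff below, the rest are module-level imports only.
from opik_optimizer.datasets import context7_eval
from opik_optimizer.gepa_optimizer import adapter, gepa_optimizer
from opik_optimizer.mcp_utils import mcp, mcp_second_pass, mcp_simulator, mcp_workflow
from opik_optimizer.utils.prompt_segments import (
    apply_segment_updates,
    extract_prompt_segments,
)
from opik_optimizer.utils.core import create_litellm_agent_class  # moved from utils.py
```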
The expanded diff below is for opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py (+399 -69). Removed 1.0.5 lines whose text is truncated in this view are kept as bare `-` markers or fragments.

```diff
@@ -1,7 +1,10 @@
+import copy
 import json
 import logging
 import os
-
+import textwrap
+from typing import Any, cast
+from collections.abc import Callable
 
 import litellm
 import opik
@@ -13,7 +16,7 @@ from opik.environment import get_tqdm_for_current_environment
 from opik.evaluation.models.litellm import opik_monitor as opik_litellm_monitor
 
 from opik_optimizer import task_evaluator
-from
+from ..utils.core import create_litellm_agent_class
 
 from .. import _throttle
 from ..base_optimizer import BaseOptimizer, OptimizationRound
@@ -21,6 +24,15 @@ from ..optimization_config import chat_prompt, mappers
 from ..optimization_result import OptimizationResult
 from ..optimizable_agent import OptimizableAgent
 from . import reporting
+import re
+
+from ..mcp_utils.mcp import PROMPT_TOOL_FOOTER, PROMPT_TOOL_HEADER
+from ..mcp_utils.mcp_workflow import (
+    MCPExecutionConfig,
+    MCPSecondPassCoordinator,
+    extract_tool_arguments,
+)
+from ..utils.prompt_segments import apply_segment_updates, extract_prompt_segments
 
 tqdm = get_tqdm_for_current_environment()
 
@@ -34,6 +46,48 @@ logger = logging.getLogger(__name__) # Gets logger configured by setup_logging
 _rate_limiter = _throttle.get_rate_limiter_for_current_opik_installation()
 
 
+def _sync_tool_description_in_system(prompt: chat_prompt.ChatPrompt) -> None:
+    if not prompt.system or not getattr(prompt, "tools", None):
+        return
+
+    description = (
+        prompt.tools[0].get("function", {}).get("description") if prompt.tools else None
+    )
+    if not description:
+        return
+
+    tool_name = (
+        prompt.tools[0].get("function", {}).get("name") if prompt.tools else None
+    )
+
+    system_text = cast(str, prompt.system)
+    if PROMPT_TOOL_HEADER not in system_text or PROMPT_TOOL_FOOTER not in system_text:
+        return
+
+    start = system_text.index(PROMPT_TOOL_HEADER) + len(PROMPT_TOOL_HEADER)
+    end = system_text.index(PROMPT_TOOL_FOOTER)
+    description_text = description.strip()
+    system_text = (
+        system_text[:start] + "\n" + description_text + "\n" + system_text[end:]
+    )
+    prompt.system = system_text
+
+    if tool_name:
+        pattern = rf"(-\s*{re.escape(tool_name)}:\s)(.*)"
+
+        def _tool_section_replacer(match: re.Match[str]) -> str:
+            return f"{match.group(1)}{description_text}"
+
+        system_text = re.sub(
+            pattern,
+            _tool_section_replacer,
+            system_text,
+            count=1,
+            flags=re.MULTILINE,
+        )
+        prompt.system = system_text
+
+
 class MetaPromptOptimizer(BaseOptimizer):
     """
     The Meta-Prompt Optimizer uses meta-prompting to improve prompts based on examples and performance.
```
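The new `_sync_tool_description_in_system` helper keeps the system message in step with the first tool's description: it splices the description between `PROMPT_TOOL_HEADER` and `PROMPT_TOOL_FOOTER`, then patches the matching `- <tool name>: ...` bullet once. A toy illustration of the same two replacements, using stand-in marker strings (the real values live in `opik_optimizer.mcp_utils.mcp` and are not shown in this diff):

```python
import re

# Stand-ins for PROMPT_TOOL_HEADER / PROMPT_TOOL_FOOTER (real values not shown here).
HEADER, FOOTER = "<<TOOL DESCRIPTION>>", "<<END TOOL DESCRIPTION>>"

system_text = (
    f"Answer using your tools.\n{HEADER}\nold text\n{FOOTER}\n"
    "- search_docs: old text"
)
description_text = "Search the documentation index and cite the section you used."

# Splice the new description between the markers, as the helper does.
start = system_text.index(HEADER) + len(HEADER)
end = system_text.index(FOOTER)
system_text = system_text[:start] + "\n" + description_text + "\n" + system_text[end:]

# Mirror it into the "- <tool>: ..." bullet, first occurrence only.
pattern = rf"(-\s*{re.escape('search_docs')}:\s)(.*)"
system_text = re.sub(
    pattern,
    lambda m: m.group(1) + description_text,
    system_text,
    count=1,
    flags=re.MULTILINE,
)
print(system_text)
```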
```diff
@@ -82,10 +136,10 @@ class MetaPromptOptimizer(BaseOptimizer):
     def __init__(
         self,
         model: str,
-        reasoning_model:
+        reasoning_model: str | None = None,
         rounds: int = DEFAULT_ROUNDS,
         num_prompts_per_round: int = DEFAULT_PROMPTS_PER_ROUND,
-        num_threads:
+        num_threads: int | None = None,
         verbose: int = 1,
         enable_context: bool = True,
         n_threads: int = 12,
@@ -116,7 +170,7 @@ class MetaPromptOptimizer(BaseOptimizer):
             print("num_threads is deprecated; use n_threads instead")
             n_threads = num_threads
         self.num_threads = n_threads
-        self.dataset:
+        self.dataset: Dataset | None = None
         self._opik_client = opik_client.get_client_cached()
         self.llm_call_counter = 0
         self.enable_context = enable_context
@@ -131,9 +185,9 @@ class MetaPromptOptimizer(BaseOptimizer):
     def _call_model(
         self,
         project_name: str,
-        messages:
+        messages: list[dict[str, str]],
         is_reasoning: bool = False,
-        optimization_id:
+        optimization_id: str | None = None,
     ) -> str:
         """Call the model with the given prompt and return the response."""
         self.llm_call_counter += 1
@@ -163,7 +217,7 @@ class MetaPromptOptimizer(BaseOptimizer):
         }
 
         # Prepare metadata that we want to be part of the LLM call context.
-        metadata_for_opik:
+        metadata_for_opik: dict[str, Any] = {}
         if project_name:
             metadata_for_opik["project_name"] = (
                 project_name  # Top-level for general use
@@ -225,11 +279,12 @@ class MetaPromptOptimizer(BaseOptimizer):
         prompt: chat_prompt.ChatPrompt,
         dataset: opik.Dataset,
         metric: Callable,
-        n_samples:
-        dataset_item_ids:
-        experiment_config:
+        n_samples: int | None = None,
+        dataset_item_ids: list[str] | None = None,
+        experiment_config: dict | None = None,
         use_full_dataset: bool = True,
-        optimization_id:
+        optimization_id: str | None = None,
+        mcp_config: MCPExecutionConfig | None = None,
         **kwargs: Any,
     ) -> float:
         """
@@ -277,6 +332,7 @@ class MetaPromptOptimizer(BaseOptimizer):
             "dataset": dataset.name,
             "configuration": {
                 "prompt": prompt.get_messages(),
+                "tools": getattr(prompt, "tools", None),
                 "n_samples": subset_size,
                 "use_full_dataset": use_full_dataset,
             },
```
```diff
@@ -285,39 +341,80 @@
         if optimization_id:
             experiment_config["optimization_id"] = optimization_id
 
-        def llm_task(dataset_item:
-            # --- Step 1: Prepare the prompt for the LLM ---
-            # messages = [
-            #     {
-            #         "role": item["role"],
-            #         "content": item["content"].format(**dataset_item),
-            #     }
-            #     for item in prompt.get_messages()
-            # ]
-            # Step 1: create the agent
+        def llm_task(dataset_item: dict[str, Any]) -> dict[str, str]:
             new_prompt = prompt.copy()
             messages = new_prompt.get_messages(dataset_item)
             new_prompt.set_messages(messages)
             agent = self.agent_class(new_prompt)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
+            if mcp_config is not None:
+                coordinator = mcp_config.coordinator
+                coordinator.reset()
+                try:
+                    logger.debug(
+                        "Calling MCP-enabled LLM with tool access; prompt length=%s",
+                        sum(len(msg["content"]) for msg in messages),
+                    )
+                    raw_model_output = agent.llm_invoke(
+                        messages=messages,
+                        seed=None,
+                        allow_tool_use=True,
+                    )
+                except Exception as exc:
+                    logger.error("Error during MCP first pass: %s", exc)
+                    raise
+
+                second_pass_messages = coordinator.build_second_pass_messages(
+                    base_messages=messages,
+                    dataset_item=dataset_item,
                 )
-                raise
 
-
-
+                if second_pass_messages is None and mcp_config.fallback_invoker:
+                    fallback_args = mcp_config.fallback_arguments(dataset_item)
+                    if fallback_args:
+                        logger.debug(
+                            "MCP fallback triggered for tool %s with args=%s",
+                            mcp_config.tool_name,
+                            fallback_args,
+                        )
+                        summary_override = mcp_config.fallback_invoker(fallback_args)
+                        second_pass_messages = coordinator.build_second_pass_messages(
+                            base_messages=messages,
+                            dataset_item=dataset_item,
+                            summary_override=summary_override,
+                        )
+
+                if second_pass_messages is not None:
+                    logger.debug(
+                        "Executing MCP second pass with %d messages",
+                        len(second_pass_messages),
+                    )
+                    final_response = agent.llm_invoke(
+                        messages=second_pass_messages,
+                        seed=None,
+                        allow_tool_use=mcp_config.allow_tool_use_on_second_pass,
+                    )
+                else:
+                    final_response = raw_model_output
+
+                cleaned_model_output = final_response.strip()
+            else:
+                try:
+                    logger.debug(
+                        f"Calling LLM with prompt length: {sum(len(msg['content']) for msg in messages)}"
+                    )
+                    raw_model_output = agent.invoke(messages)
+                    logger.debug(f"LLM raw response length: {len(raw_model_output)}")
+                    logger.debug(f"LLM raw output: {raw_model_output}")
+                except Exception as e:
+                    logger.error(f"Error calling model with prompt: {e}")
+                    logger.error(f"Failed prompt: {messages}")
+                    logger.error(
+                        f"Prompt length: {sum(len(msg['content']) for msg in messages)}"
+                    )
+                    raise
+
+                cleaned_model_output = raw_model_output.strip()
 
             result = {
                 mappers.EVALUATED_LLM_TASK_OUTPUT: cleaned_model_output,
```
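When `mcp_config` is set, `llm_task` now runs a two-pass exchange: a first call with tool use enabled, a coordinator-built follow-up that folds the tool summary back in, and a direct tool invocation as a fallback. A condensed sketch of that control flow (names mirror the hunk above; `item` stands in for `dataset_item`, and reading a `None` return from the coordinator as "no tool output captured" is an assumption based on the fallback branch):

```python
# First pass: let the model call tools.
first = agent.llm_invoke(messages=messages, seed=None, allow_tool_use=True)

# The coordinator turns the recorded tool output into follow-up messages;
# None presumably means nothing was captured.
followup = coordinator.build_second_pass_messages(
    base_messages=messages, dataset_item=item
)

if followup is None and mcp_config.fallback_invoker:
    # Fallback: call the tool directly with arguments pulled from the item.
    args = mcp_config.fallback_arguments(item)
    if args:
        summary = mcp_config.fallback_invoker(args)
        followup = coordinator.build_second_pass_messages(
            base_messages=messages, dataset_item=item, summary_override=summary
        )

final = (
    agent.llm_invoke(
        messages=followup,
        seed=None,
        allow_tool_use=mcp_config.allow_tool_use_on_second_pass,
    )
    if followup is not None
    else first
)
```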
```diff
@@ -348,12 +445,16 @@ class MetaPromptOptimizer(BaseOptimizer):
         prompt: chat_prompt.ChatPrompt,
         dataset: Dataset,
         metric: Callable,
-        experiment_config:
-        n_samples:
+        experiment_config: dict | None = None,
+        n_samples: int | None = None,
         auto_continue: bool = False,
-        agent_class:
+        agent_class: type[OptimizableAgent] | None = None,
         **kwargs: Any,
     ) -> OptimizationResult:
+        mcp_config = kwargs.pop("mcp_config", None)
+        candidate_generator = kwargs.pop("candidate_generator", None)
+        candidate_generator_kwargs = kwargs.pop("candidate_generator_kwargs", None)
+
         """
         Optimize a prompt using meta-reasoning.
 
@@ -385,7 +486,7 @@ class MetaPromptOptimizer(BaseOptimizer):
         prompt.model_kwargs = self.model_kwargs
 
         if agent_class is None:
-            self.agent_class =
+            self.agent_class = create_litellm_agent_class(prompt)
         else:
             self.agent_class = agent_class
 
@@ -424,6 +525,7 @@ class MetaPromptOptimizer(BaseOptimizer):
                 "auto_continue": auto_continue,
             },
             verbose=self.verbose,
+            tools=getattr(prompt, "tools", None),
         )
 
         try:
@@ -436,6 +538,9 @@ class MetaPromptOptimizer(BaseOptimizer):
                 experiment_config=experiment_config,
                 n_samples=n_samples,
                 auto_continue=auto_continue,
+                mcp_config=mcp_config,
+                candidate_generator=candidate_generator,
+                candidate_generator_kwargs=candidate_generator_kwargs,
                 **kwargs,
             )
             if optimization:
@@ -449,15 +554,80 @@ class MetaPromptOptimizer(BaseOptimizer):
             logger.debug("Optimization marked as cancelled")
             raise e
 
+    def optimize_mcp(
+        self,
+        prompt: chat_prompt.ChatPrompt,
+        dataset: Dataset,
+        metric: Callable,
+        *,
+        tool_name: str,
+        second_pass: MCPSecondPassCoordinator,
+        experiment_config: dict | None = None,
+        n_samples: int | None = None,
+        auto_continue: bool = False,
+        agent_class: type[OptimizableAgent] | None = None,
+        fallback_invoker: Callable[[dict[str, Any]], str] | None = None,
+        fallback_arguments: Callable[[Any], dict[str, Any]] | None = None,
+        allow_tool_use_on_second_pass: bool = False,
+        **kwargs: Any,
+    ) -> OptimizationResult:
+        panel_style = kwargs.pop("tool_panel_style", "bright_magenta")
+
+        if prompt.tools is None or not prompt.tools:
+            raise ValueError("Prompt must include tools for MCP optimization")
+
+        fallback_args_fn = fallback_arguments or extract_tool_arguments
+
+        if fallback_invoker is None:
+            function_map = prompt.function_map or {}
+            fallback_invoker = function_map.get(tool_name)
+
+        mcp_config = MCPExecutionConfig(
+            coordinator=second_pass,
+            tool_name=tool_name,
+            fallback_arguments=fallback_args_fn,
+            fallback_invoker=fallback_invoker,
+            allow_tool_use_on_second_pass=allow_tool_use_on_second_pass,
+        )
+
+        tool_segment_id = f"tool:{tool_name}"
+        segments = extract_prompt_segments(prompt)
+        if tool_segment_id not in {segment.segment_id for segment in segments}:
+            raise ValueError(f"Tool '{tool_name}' not present in prompt tools")
+
+        return self.optimize_prompt(
+            prompt=prompt,
+            dataset=dataset,
+            metric=metric,
+            experiment_config=experiment_config,
+            n_samples=n_samples,
+            auto_continue=auto_continue,
+            agent_class=agent_class,
+            mcp_config=mcp_config,
+            candidate_generator=self._generate_mcp_candidate_prompts,
+            candidate_generator_kwargs={
+                "tool_segment_id": tool_segment_id,
+                "tool_name": tool_name,
+                "panel_style": panel_style,
+            },
+            tool_panel_style=panel_style,
+            **kwargs,
+        )
+
     def _optimize_prompt(
         self,
-        optimization_id:
+        optimization_id: str | None,
         prompt: chat_prompt.ChatPrompt,
         dataset: Dataset,
         metric: Callable,
-        experiment_config:
-        n_samples:
+        experiment_config: dict | None,
+        n_samples: int | None,
         auto_continue: bool,
+        mcp_config: MCPExecutionConfig | None = None,
+        candidate_generator: None
+        | (Callable[..., list[chat_prompt.ChatPrompt]]) = None,
+        candidate_generator_kwargs: dict[str, Any] | None = None,
+        tool_panel_style: str = "bright_magenta",
         **kwargs: Any,
     ) -> OptimizationResult:
         self.auto_continue = auto_continue
```
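`optimize_mcp` is the new public entry point: it validates that the prompt carries tools, resolves a fallback invoker from `prompt.function_map`, and delegates to `optimize_prompt` with the MCP candidate generator wired in. A minimal sketch of a call site, assuming a `dataset`, `metric`, and `MCPSecondPassCoordinator` built elsewhere; the tool schema, names, and the top-level `MetaPromptOptimizer` export are illustrative assumptions:

```python
from opik_optimizer import MetaPromptOptimizer  # assumed top-level export
from opik_optimizer.optimization_config import chat_prompt

def search_docs(arguments: dict) -> str:
    """Stub tool implementation used only for this sketch."""
    return f"Top result for {arguments.get('query')!r}"

optimizer = MetaPromptOptimizer(model="openai/gpt-4o-mini")  # any litellm model id

prompt = chat_prompt.ChatPrompt(
    system="Answer questions about the library using your tools.",
    user="{question}",
    tools=[
        {
            "type": "function",
            "function": {
                "name": "search_docs",  # hypothetical tool
                "description": "Search the documentation index.",
                "parameters": {
                    "type": "object",
                    "properties": {"query": {"type": "string"}},
                    "required": ["query"],
                },
            },
        }
    ],
    function_map={"search_docs": search_docs},  # used as the fallback invoker
)

result = optimizer.optimize_mcp(
    prompt=prompt,
    dataset=dataset,          # opik.Dataset, assumed to exist
    metric=metric,            # Callable scoring a dataset item against the output
    tool_name="search_docs",
    second_pass=second_pass,  # MCPSecondPassCoordinator, assumed built elsewhere
)
```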
```diff
@@ -494,10 +664,11 @@ class MetaPromptOptimizer(BaseOptimizer):
             experiment_config=experiment_config,
             use_full_dataset=n_samples is None,
             verbose=self.verbose,
+            mcp_config=mcp_config,
         )
         best_score = initial_score
         best_prompt = current_prompt
-        rounds:
+        rounds: list[OptimizationRound] = []
 
         baseline_reporter.set_score(initial_score)
 
@@ -510,8 +681,11 @@ class MetaPromptOptimizer(BaseOptimizer):
             previous_best_score = best_score
 
             # Step 1. Create a set of candidate prompts
+            generator = candidate_generator or self._generate_candidate_prompts
+            generator_kwargs = dict(candidate_generator_kwargs or {})
+
             try:
-                candidate_prompts =
+                candidate_prompts = generator(
                     project_name=self.agent_class.project_name,
                     current_prompt=best_prompt,
                     best_score=best_score,
@@ -519,25 +693,25 @@ class MetaPromptOptimizer(BaseOptimizer):
                     previous_rounds=rounds,
                     metric=metric,
                     optimization_id=optimization_id,
+                    **generator_kwargs,
                 )
             except Exception as e:
                 round_reporter.failed_to_generate(self.num_prompts_per_round, e)
                 continue
 
             # Step 2. Score each candidate prompt
-            prompt_scores:
+            prompt_scores: list[tuple[chat_prompt.ChatPrompt, float]] = []
             for candidate_count, prompt in enumerate(candidate_prompts):
                 with reporting.display_prompt_candidate_scoring_report(
                     verbose=self.verbose
                 ) as eval_report:
                     eval_report.set_generated_prompts(candidate_count, prompt)
 
-
-                    new_prompt.set_messages(prompt.get_messages())
+                    candidate_prompt = prompt.copy()
 
                     try:
                         prompt_score = self._evaluate_prompt(
-                            prompt=
+                            prompt=candidate_prompt,
                             optimization_id=optimization_id,
                             dataset=dataset,
                             metric=metric,
@@ -545,11 +719,12 @@ class MetaPromptOptimizer(BaseOptimizer):
                             use_full_dataset=False,
                             experiment_config=experiment_config,
                             verbose=self.verbose,
+                            mcp_config=mcp_config,
                         )
 
                         eval_report.set_final_score(best_score, prompt_score)
                     except Exception:
-
+                        logger.warning("Failed evaluating agent; continuing...")
                         prompt_score = 0
 
                 prompt_scores.append((prompt, prompt_score))
@@ -584,24 +759,39 @@ class MetaPromptOptimizer(BaseOptimizer):
                 best_score = best_cand_score_avg
                 best_prompt = best_candidate_this_round
 
+        if tool_panel_style and getattr(best_prompt, "tools", None):
+            description = (
+                best_prompt.tools[0].get("function", {}).get("description", "")
+                if best_prompt.tools
+                else ""
+            )
+            if description.strip():
+                reporting.display_tool_description(
+                    description.strip(),
+                    "Final tool description",
+                    tool_panel_style,
+                )
+
         reporting.display_result(
             initial_score,
             best_score,
             best_prompt.get_messages() if best_prompt is not None else [],
             verbose=self.verbose,
+            tools=getattr(best_prompt, "tools", None) if best_prompt else None,
         )
 
         return self._create_result(
             metric,
-            initial_prompt=
-
-
+            initial_prompt=(
+                initial_prompt.get_messages() if initial_prompt is not None else []
+            ),
             best_prompt=best_prompt.get_messages() if best_prompt is not None else [],
             best_score=best_score,
             initial_score=initial_score,
             rounds=rounds,
             dataset_id=dataset.id,
             optimization_id=optimization_id,
+            best_tools=getattr(best_prompt, "tools", None) if best_prompt else None,
         )
 
     def _calculate_improvement(
@@ -620,19 +810,24 @@ class MetaPromptOptimizer(BaseOptimizer):
         current_best_prompt: chat_prompt.ChatPrompt,
         current_best_score: float,
         best_prompt_overall: chat_prompt.ChatPrompt,
-        evaluated_candidates:
+        evaluated_candidates: list[tuple[chat_prompt.ChatPrompt, float]],
         previous_best_score: float,
         improvement_this_round: float,
     ) -> OptimizationRound:
         """Create an OptimizationRound object with the current round's data."""
-        generated_prompts_log = []
+        generated_prompts_log: list[dict[str, Any]] = []
        for prompt, score in evaluated_candidates:
             improvement_vs_prev = self._calculate_improvement(
                 score, previous_best_score
             )
+            tool_entries: list[Any] = []
+            if getattr(prompt, "tools", None):
+                tool_entries = copy.deepcopy(list(prompt.tools or []))
+
             generated_prompts_log.append(
                 {
                     "prompt": prompt.get_messages(),
+                    "tools": tool_entries,
                     "score": score,
                     "improvement": improvement_vs_prev,
                 }
@@ -651,13 +846,14 @@ class MetaPromptOptimizer(BaseOptimizer):
     def _create_result(
         self,
         metric: Callable,
-        initial_prompt:
-        best_prompt:
+        initial_prompt: list[dict[str, str]],
+        best_prompt: list[dict[str, str]],
         best_score: float,
         initial_score: float,
-        rounds:
-        dataset_id:
-        optimization_id:
+        rounds: list[OptimizationRound],
+        dataset_id: str | None,
+        optimization_id: str | None,
+        best_tools: list[dict[str, Any]] | None,
     ) -> OptimizationResult:
         """Create the final OptimizationResult object."""
         details = {
@@ -670,6 +866,18 @@ class MetaPromptOptimizer(BaseOptimizer):
             "temperature": self.model_kwargs.get("temperature"),
         }
 
+        if best_tools:
+            details["final_tools"] = best_tools
+
+        tool_prompts = None
+        if best_tools:
+            tool_prompts = {
+                (tool.get("function", {}).get("name") or f"tool_{idx}"): tool.get(
+                    "function", {}
+                ).get("description")
+                for idx, tool in enumerate(best_tools)
+            }
+
         return OptimizationResult(
             optimizer=self.__class__.__name__,
             prompt=best_prompt,
@@ -681,6 +889,7 @@ class MetaPromptOptimizer(BaseOptimizer):
             llm_calls=self.llm_call_counter,
             dataset_id=dataset_id,
             optimization_id=optimization_id,
+            tool_prompts=tool_prompts,
         )
 
     def _get_task_context(self, metric: Callable) -> str:
```
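With `best_tools` plumbed through, the result now carries the optimized tool schemas under `details["final_tools"]` and a `tool_prompts` mapping of tool name to final description. A sketch of reading them back, assuming `result` came from the `optimize_mcp` call above and that `OptimizationResult` exposes the `details` dict populated here:

```python
# details["final_tools"] holds the full OpenAI-style tool schemas.
final_tools = result.details.get("final_tools", [])  # .details access is assumed
for tool in final_tools:
    fn = tool.get("function", {})
    print(fn.get("name"), "->", fn.get("description"))

# tool_prompts is the flattened name -> description mapping added in this release.
for name, description in (result.tool_prompts or {}).items():
    print(name, "->", description)
```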
```diff
@@ -714,11 +923,11 @@ class MetaPromptOptimizer(BaseOptimizer):
         current_prompt: chat_prompt.ChatPrompt,
         best_score: float,
         round_num: int,
-        previous_rounds:
+        previous_rounds: list[OptimizationRound],
         metric: Callable,
-        optimization_id:
-        project_name:
-    ) ->
+        optimization_id: str | None = None,
+        project_name: str | None = None,
+    ) -> list[chat_prompt.ChatPrompt]:
         """Generate candidate prompts using meta-prompting."""
         with reporting.display_candidate_generation_report(
             self.num_prompts_per_round, verbose=self.verbose
@@ -819,7 +1028,7 @@ class MetaPromptOptimizer(BaseOptimizer):
         )
 
         # Extract and log valid prompts
-        valid_prompts:
+        valid_prompts: list[chat_prompt.ChatPrompt] = []
         for item in json_result["prompts"]:
             if (
                 isinstance(item, dict)
@@ -870,7 +1079,128 @@ class MetaPromptOptimizer(BaseOptimizer):
                 f"Unexpected error during candidate prompt generation: {e}"
             )
 
-    def
+    def _generate_mcp_candidate_prompts(
+        self,
+        current_prompt: chat_prompt.ChatPrompt,
+        best_score: float,
+        round_num: int,
+        previous_rounds: list[OptimizationRound],
+        metric: Callable,
+        tool_segment_id: str,
+        tool_name: str,
+        optimization_id: str | None = None,
+        project_name: str | None = None,
+        panel_style: str = "bright_magenta",
+    ) -> list[chat_prompt.ChatPrompt]:
+        segments = {
+            segment.segment_id: segment
+            for segment in extract_prompt_segments(current_prompt)
+        }
+        if tool_segment_id not in segments:
+            raise ValueError(f"Tool segment '{tool_segment_id}' not found in prompt")
+
+        target_segment = segments[tool_segment_id]
+        current_description = target_segment.content
+        tool_metadata = target_segment.metadata.get("raw_tool", {})
+
+        history_context = self._build_history_context(previous_rounds)
+
+        instruction = textwrap.dedent(
+            f"""
+            Current tool name: {tool_name}
+            Current tool description:
+            ---
+            {current_description}
+            ---
+
+            Tool metadata (JSON):
+            {json.dumps(tool_metadata, indent=2)}
+
+            Current best score: {best_score:.4f}
+            {history_context}
+
+            Generate {self.num_prompts_per_round} improved descriptions for this tool.
+            Each description should clarify expected input arguments and set explicit expectations
+            for how the tool output must be used in the final response.
+            Avoid changing unrelated parts of the prompt. Focus only on the description text for `{tool_name}`.
+
+            Return a JSON object of the form:
+            {{
+                "prompts": [
+                    {{
+                        "tool_description": "...",
+                        "improvement_focus": "...",
+                        "reasoning": "..."
+                    }}
+                ]
+            }}
+            """
+        ).strip()
+
+        with reporting.display_candidate_generation_report(
+            self.num_prompts_per_round, verbose=self.verbose
+        ) as candidate_generation_report:
+            try:
+                content = self._call_model(
+                    project_name,
+                    messages=[
+                        {"role": "system", "content": self._REASONING_SYSTEM_PROMPT},
+                        {"role": "user", "content": instruction},
+                    ],
+                    is_reasoning=True,
+                    optimization_id=optimization_id,
+                )
+
+                try:
+                    json_result = json.loads(content)
+                except json.JSONDecodeError:
+                    import re
+
+                    json_match = re.search(r"\{.*\}", content, re.DOTALL)
+                    if not json_match:
+                        raise ValueError("No JSON object found in reasoning output")
+                    json_result = json.loads(json_match.group())
+
+                prompts_payload = json_result.get("prompts")
+                if not isinstance(prompts_payload, list):
+                    raise ValueError("Reasoning output missing 'prompts' list")
+
+                candidate_generation_report.set_generated_prompts()
+
+                candidates: list[chat_prompt.ChatPrompt] = []
+                for item in prompts_payload:
+                    if not isinstance(item, dict):
+                        continue
+                    description = item.get("tool_description")
+                    if not isinstance(description, str) or not description.strip():
+                        continue
+
+                    updated_prompt = apply_segment_updates(
+                        current_prompt,
+                        {tool_segment_id: description.strip()},
+                    )
+                    _sync_tool_description_in_system(updated_prompt)
+                    if (
+                        description.strip()
+                        and description.strip() != current_description.strip()
+                    ):
+                        reporting.display_tool_description(
+                            description.strip(),
+                            f"Round {round_num + 1} tool description",
+                            panel_style,
+                        )
+                    candidates.append(updated_prompt)
+
+                if not candidates:
+                    raise ValueError(
+                        "Reasoning output did not produce valid tool descriptions"
+                    )
+
+                return candidates
+            except Exception as exc:
+                raise ValueError(f"Error generating MCP prompt candidates: {exc}")
+
+    def _build_history_context(self, previous_rounds: list[OptimizationRound]) -> str:
         """Build context from previous optimization rounds."""
         if not previous_rounds:
             return ""
@@ -896,7 +1226,7 @@ class MetaPromptOptimizer(BaseOptimizer):
 
     def _get_evaluation_subset(
         self, dataset: opik.Dataset, min_size: int = 20, max_size: int = 100
-    ) ->
+    ) -> list[dict[str, Any]]:
         """Get a random subset of the dataset for evaluation.
 
         Returns:
```