opik-optimizer 1.0.6__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff shows the contents of two publicly released versions of the package as they appear in their public registries. It is provided for informational purposes only and reflects only the changes between those published versions.
Files changed (54)
  1. opik_optimizer/__init__.py +2 -0
  2. opik_optimizer/_throttle.py +2 -1
  3. opik_optimizer/base_optimizer.py +28 -11
  4. opik_optimizer/colbert.py +236 -0
  5. opik_optimizer/data/context7_eval.jsonl +3 -0
  6. opik_optimizer/datasets/context7_eval.py +90 -0
  7. opik_optimizer/datasets/tiny_test.py +33 -34
  8. opik_optimizer/datasets/truthful_qa.py +2 -2
  9. opik_optimizer/evolutionary_optimizer/crossover_ops.py +194 -0
  10. opik_optimizer/evolutionary_optimizer/evaluation_ops.py +73 -0
  11. opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +124 -941
  12. opik_optimizer/evolutionary_optimizer/helpers.py +10 -0
  13. opik_optimizer/evolutionary_optimizer/llm_support.py +134 -0
  14. opik_optimizer/evolutionary_optimizer/mutation_ops.py +292 -0
  15. opik_optimizer/evolutionary_optimizer/population_ops.py +223 -0
  16. opik_optimizer/evolutionary_optimizer/prompts.py +305 -0
  17. opik_optimizer/evolutionary_optimizer/reporting.py +16 -4
  18. opik_optimizer/evolutionary_optimizer/style_ops.py +86 -0
  19. opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +26 -23
  20. opik_optimizer/few_shot_bayesian_optimizer/reporting.py +12 -5
  21. opik_optimizer/gepa_optimizer/__init__.py +3 -0
  22. opik_optimizer/gepa_optimizer/adapter.py +152 -0
  23. opik_optimizer/gepa_optimizer/gepa_optimizer.py +556 -0
  24. opik_optimizer/gepa_optimizer/reporting.py +181 -0
  25. opik_optimizer/logging_config.py +42 -7
  26. opik_optimizer/mcp_utils/__init__.py +22 -0
  27. opik_optimizer/mcp_utils/mcp.py +541 -0
  28. opik_optimizer/mcp_utils/mcp_second_pass.py +152 -0
  29. opik_optimizer/mcp_utils/mcp_simulator.py +116 -0
  30. opik_optimizer/mcp_utils/mcp_workflow.py +493 -0
  31. opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +399 -69
  32. opik_optimizer/meta_prompt_optimizer/reporting.py +16 -2
  33. opik_optimizer/mipro_optimizer/_lm.py +20 -20
  34. opik_optimizer/mipro_optimizer/_mipro_optimizer_v2.py +51 -50
  35. opik_optimizer/mipro_optimizer/mipro_optimizer.py +33 -28
  36. opik_optimizer/mipro_optimizer/utils.py +2 -4
  37. opik_optimizer/optimizable_agent.py +16 -16
  38. opik_optimizer/optimization_config/chat_prompt.py +44 -23
  39. opik_optimizer/optimization_config/configs.py +3 -3
  40. opik_optimizer/optimization_config/mappers.py +9 -8
  41. opik_optimizer/optimization_result.py +21 -14
  42. opik_optimizer/reporting_utils.py +61 -10
  43. opik_optimizer/task_evaluator.py +9 -8
  44. opik_optimizer/utils/__init__.py +15 -0
  45. opik_optimizer/{utils.py → utils/core.py} +111 -26
  46. opik_optimizer/utils/dataset_utils.py +49 -0
  47. opik_optimizer/utils/prompt_segments.py +186 -0
  48. {opik_optimizer-1.0.6.dist-info → opik_optimizer-1.1.0.dist-info}/METADATA +93 -16
  49. opik_optimizer-1.1.0.dist-info/RECORD +73 -0
  50. opik_optimizer-1.1.0.dist-info/licenses/LICENSE +203 -0
  51. opik_optimizer-1.0.6.dist-info/RECORD +0 -50
  52. opik_optimizer-1.0.6.dist-info/licenses/LICENSE +0 -21
  53. {opik_optimizer-1.0.6.dist-info → opik_optimizer-1.1.0.dist-info}/WHEEL +0 -0
  54. {opik_optimizer-1.0.6.dist-info → opik_optimizer-1.1.0.dist-info}/top_level.txt +0 -0
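
The hunks below are from opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py (entry 31 in the list above): the optimizer is rewired onto the new utils and mcp_utils packages, the typing.Optional/List/Dict annotations move to PEP 604/585 syntax, and an MCP-aware optimization path is added. For orientation, a minimal sketch of the unchanged baseline entry point, assuming the documented ChatPrompt/optimize_prompt usage pattern; the model name, dataset, and metric are placeholders, not part of this diff:

from opik_optimizer import ChatPrompt, MetaPromptOptimizer

# Toy metric; assumes the (dataset_item, llm_output) -> float signature used by opik_optimizer metrics.
def exact_match(dataset_item, llm_output):
    return float(dataset_item["answer"].strip() == llm_output.strip())

optimizer = MetaPromptOptimizer(model="openai/gpt-4o-mini")  # placeholder model name
prompt = ChatPrompt(system="Answer concisely.", user="{question}")

# `dataset` is an opik.Dataset prepared elsewhere; n_samples bounds evaluation cost per round.
result = optimizer.optimize_prompt(prompt=prompt, dataset=dataset, metric=exact_match, n_samples=20)
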
@@ -1,7 +1,10 @@
+import copy
 import json
 import logging
 import os
-from typing import Any, Callable, Dict, List, Optional, Tuple, Type
+import textwrap
+from typing import Any, cast
+from collections.abc import Callable

 import litellm
 import opik
@@ -13,7 +16,7 @@ from opik.environment import get_tqdm_for_current_environment
 from opik.evaluation.models.litellm import opik_monitor as opik_litellm_monitor

 from opik_optimizer import task_evaluator
-from opik_optimizer import utils
+from ..utils.core import create_litellm_agent_class

 from .. import _throttle
 from ..base_optimizer import BaseOptimizer, OptimizationRound
@@ -21,6 +24,15 @@ from ..optimization_config import chat_prompt, mappers
 from ..optimization_result import OptimizationResult
 from ..optimizable_agent import OptimizableAgent
 from . import reporting
+import re
+
+from ..mcp_utils.mcp import PROMPT_TOOL_FOOTER, PROMPT_TOOL_HEADER
+from ..mcp_utils.mcp_workflow import (
+    MCPExecutionConfig,
+    MCPSecondPassCoordinator,
+    extract_tool_arguments,
+)
+from ..utils.prompt_segments import apply_segment_updates, extract_prompt_segments

 tqdm = get_tqdm_for_current_environment()

@@ -34,6 +46,48 @@ logger = logging.getLogger(__name__) # Gets logger configured by setup_logging
 _rate_limiter = _throttle.get_rate_limiter_for_current_opik_installation()


+def _sync_tool_description_in_system(prompt: chat_prompt.ChatPrompt) -> None:
+    if not prompt.system or not getattr(prompt, "tools", None):
+        return
+
+    description = (
+        prompt.tools[0].get("function", {}).get("description") if prompt.tools else None
+    )
+    if not description:
+        return
+
+    tool_name = (
+        prompt.tools[0].get("function", {}).get("name") if prompt.tools else None
+    )
+
+    system_text = cast(str, prompt.system)
+    if PROMPT_TOOL_HEADER not in system_text or PROMPT_TOOL_FOOTER not in system_text:
+        return
+
+    start = system_text.index(PROMPT_TOOL_HEADER) + len(PROMPT_TOOL_HEADER)
+    end = system_text.index(PROMPT_TOOL_FOOTER)
+    description_text = description.strip()
+    system_text = (
+        system_text[:start] + "\n" + description_text + "\n" + system_text[end:]
+    )
+    prompt.system = system_text
+
+    if tool_name:
+        pattern = rf"(-\s*{re.escape(tool_name)}:\s)(.*)"
+
+        def _tool_section_replacer(match: re.Match[str]) -> str:
+            return f"{match.group(1)}{description_text}"
+
+        system_text = re.sub(
+            pattern,
+            _tool_section_replacer,
+            system_text,
+            count=1,
+            flags=re.MULTILINE,
+        )
+        prompt.system = system_text
+
+
 class MetaPromptOptimizer(BaseOptimizer):
     """
     The Meta-Prompt Optimizer uses meta-prompting to improve prompts based on examples and performance.
@@ -82,10 +136,10 @@ class MetaPromptOptimizer(BaseOptimizer):
     def __init__(
         self,
         model: str,
-        reasoning_model: Optional[str] = None,
+        reasoning_model: str | None = None,
         rounds: int = DEFAULT_ROUNDS,
         num_prompts_per_round: int = DEFAULT_PROMPTS_PER_ROUND,
-        num_threads: Optional[int] = None,
+        num_threads: int | None = None,
         verbose: int = 1,
         enable_context: bool = True,
         n_threads: int = 12,
@@ -116,7 +170,7 @@ class MetaPromptOptimizer(BaseOptimizer):
             print("num_threads is deprecated; use n_threads instead")
             n_threads = num_threads
         self.num_threads = n_threads
-        self.dataset: Optional[Dataset] = None
+        self.dataset: Dataset | None = None
         self._opik_client = opik_client.get_client_cached()
         self.llm_call_counter = 0
         self.enable_context = enable_context
@@ -131,9 +185,9 @@ class MetaPromptOptimizer(BaseOptimizer):
     def _call_model(
         self,
         project_name: str,
-        messages: List[Dict[str, str]],
+        messages: list[dict[str, str]],
         is_reasoning: bool = False,
-        optimization_id: Optional[str] = None,
+        optimization_id: str | None = None,
     ) -> str:
         """Call the model with the given prompt and return the response."""
         self.llm_call_counter += 1
@@ -163,7 +217,7 @@ class MetaPromptOptimizer(BaseOptimizer):
         }

         # Prepare metadata that we want to be part of the LLM call context.
-        metadata_for_opik: Dict[str, Any] = {}
+        metadata_for_opik: dict[str, Any] = {}
         if project_name:
             metadata_for_opik["project_name"] = (
                 project_name  # Top-level for general use
@@ -225,11 +279,12 @@ class MetaPromptOptimizer(BaseOptimizer):
         prompt: chat_prompt.ChatPrompt,
         dataset: opik.Dataset,
         metric: Callable,
-        n_samples: Optional[int] = None,
-        dataset_item_ids: Optional[List[str]] = None,
-        experiment_config: Optional[Dict] = None,
+        n_samples: int | None = None,
+        dataset_item_ids: list[str] | None = None,
+        experiment_config: dict | None = None,
         use_full_dataset: bool = True,
-        optimization_id: Optional[str] = None,
+        optimization_id: str | None = None,
+        mcp_config: MCPExecutionConfig | None = None,
         **kwargs: Any,
     ) -> float:
         """
@@ -277,6 +332,7 @@ class MetaPromptOptimizer(BaseOptimizer):
             "dataset": dataset.name,
             "configuration": {
                 "prompt": prompt.get_messages(),
+                "tools": getattr(prompt, "tools", None),
                 "n_samples": subset_size,
                 "use_full_dataset": use_full_dataset,
             },
@@ -285,39 +341,80 @@ class MetaPromptOptimizer(BaseOptimizer):
         if optimization_id:
             experiment_config["optimization_id"] = optimization_id

-        def llm_task(dataset_item: Dict[str, Any]) -> Dict[str, str]:
-            # --- Step 1: Prepare the prompt for the LLM ---
-            # messages = [
-            #     {
-            #         "role": item["role"],
-            #         "content": item["content"].format(**dataset_item),
-            #     }
-            #     for item in prompt.get_messages()
-            # ]
-            # Step 1: create the agent
+        def llm_task(dataset_item: dict[str, Any]) -> dict[str, str]:
             new_prompt = prompt.copy()
             messages = new_prompt.get_messages(dataset_item)
             new_prompt.set_messages(messages)
             agent = self.agent_class(new_prompt)

-            # --- Step 2: Call the model ---
-            try:
-                logger.debug(
-                    f"Calling LLM with prompt length: {sum(len(msg['content']) for msg in messages)}"
-                )
-                raw_model_output = agent.invoke(messages)
-                logger.debug(f"LLM raw response length: {len(raw_model_output)}")
-                logger.debug(f"LLM raw output: {raw_model_output}")
-            except Exception as e:
-                logger.error(f"Error calling model with prompt: {e}")
-                logger.error(f"Failed prompt: {messages}")
-                logger.error(
-                    f"Prompt length: {sum(len(msg['content']) for msg in messages)}"
+            if mcp_config is not None:
+                coordinator = mcp_config.coordinator
+                coordinator.reset()
+                try:
+                    logger.debug(
+                        "Calling MCP-enabled LLM with tool access; prompt length=%s",
+                        sum(len(msg["content"]) for msg in messages),
+                    )
+                    raw_model_output = agent.llm_invoke(
+                        messages=messages,
+                        seed=None,
+                        allow_tool_use=True,
+                    )
+                except Exception as exc:
+                    logger.error("Error during MCP first pass: %s", exc)
+                    raise
+
+                second_pass_messages = coordinator.build_second_pass_messages(
+                    base_messages=messages,
+                    dataset_item=dataset_item,
                 )
-                raise

-            # --- Step 3: Clean the model's output before metric evaluation ---
-            cleaned_model_output = raw_model_output.strip()
+                if second_pass_messages is None and mcp_config.fallback_invoker:
+                    fallback_args = mcp_config.fallback_arguments(dataset_item)
+                    if fallback_args:
+                        logger.debug(
+                            "MCP fallback triggered for tool %s with args=%s",
+                            mcp_config.tool_name,
+                            fallback_args,
+                        )
+                        summary_override = mcp_config.fallback_invoker(fallback_args)
+                        second_pass_messages = coordinator.build_second_pass_messages(
+                            base_messages=messages,
+                            dataset_item=dataset_item,
+                            summary_override=summary_override,
+                        )
+
+                if second_pass_messages is not None:
+                    logger.debug(
+                        "Executing MCP second pass with %d messages",
+                        len(second_pass_messages),
+                    )
+                    final_response = agent.llm_invoke(
+                        messages=second_pass_messages,
+                        seed=None,
+                        allow_tool_use=mcp_config.allow_tool_use_on_second_pass,
+                    )
+                else:
+                    final_response = raw_model_output
+
+                cleaned_model_output = final_response.strip()
+            else:
+                try:
+                    logger.debug(
+                        f"Calling LLM with prompt length: {sum(len(msg['content']) for msg in messages)}"
+                    )
+                    raw_model_output = agent.invoke(messages)
+                    logger.debug(f"LLM raw response length: {len(raw_model_output)}")
+                    logger.debug(f"LLM raw output: {raw_model_output}")
+                except Exception as e:
+                    logger.error(f"Error calling model with prompt: {e}")
+                    logger.error(f"Failed prompt: {messages}")
+                    logger.error(
+                        f"Prompt length: {sum(len(msg['content']) for msg in messages)}"
+                    )
+                    raise
+
+                cleaned_model_output = raw_model_output.strip()

         result = {
             mappers.EVALUATED_LLM_TASK_OUTPUT: cleaned_model_output,
@@ -348,12 +445,16 @@ class MetaPromptOptimizer(BaseOptimizer):
         prompt: chat_prompt.ChatPrompt,
         dataset: Dataset,
         metric: Callable,
-        experiment_config: Optional[Dict] = None,
-        n_samples: Optional[int] = None,
+        experiment_config: dict | None = None,
+        n_samples: int | None = None,
         auto_continue: bool = False,
-        agent_class: Optional[Type[OptimizableAgent]] = None,
+        agent_class: type[OptimizableAgent] | None = None,
         **kwargs: Any,
     ) -> OptimizationResult:
+        mcp_config = kwargs.pop("mcp_config", None)
+        candidate_generator = kwargs.pop("candidate_generator", None)
+        candidate_generator_kwargs = kwargs.pop("candidate_generator_kwargs", None)
+
         """
         Optimize a prompt using meta-reasoning.

@@ -385,7 +486,7 @@ class MetaPromptOptimizer(BaseOptimizer):
             prompt.model_kwargs = self.model_kwargs

         if agent_class is None:
-            self.agent_class = utils.create_litellm_agent_class(prompt)
+            self.agent_class = create_litellm_agent_class(prompt)
         else:
             self.agent_class = agent_class

@@ -424,6 +525,7 @@ class MetaPromptOptimizer(BaseOptimizer):
                 "auto_continue": auto_continue,
             },
             verbose=self.verbose,
+            tools=getattr(prompt, "tools", None),
         )

         try:
@@ -436,6 +538,9 @@ class MetaPromptOptimizer(BaseOptimizer):
                 experiment_config=experiment_config,
                 n_samples=n_samples,
                 auto_continue=auto_continue,
+                mcp_config=mcp_config,
+                candidate_generator=candidate_generator,
+                candidate_generator_kwargs=candidate_generator_kwargs,
                 **kwargs,
             )
             if optimization:
@@ -449,15 +554,80 @@ class MetaPromptOptimizer(BaseOptimizer):
                 logger.debug("Optimization marked as cancelled")
             raise e

+    def optimize_mcp(
+        self,
+        prompt: chat_prompt.ChatPrompt,
+        dataset: Dataset,
+        metric: Callable,
+        *,
+        tool_name: str,
+        second_pass: MCPSecondPassCoordinator,
+        experiment_config: dict | None = None,
+        n_samples: int | None = None,
+        auto_continue: bool = False,
+        agent_class: type[OptimizableAgent] | None = None,
+        fallback_invoker: Callable[[dict[str, Any]], str] | None = None,
+        fallback_arguments: Callable[[Any], dict[str, Any]] | None = None,
+        allow_tool_use_on_second_pass: bool = False,
+        **kwargs: Any,
+    ) -> OptimizationResult:
+        panel_style = kwargs.pop("tool_panel_style", "bright_magenta")
+
+        if prompt.tools is None or not prompt.tools:
+            raise ValueError("Prompt must include tools for MCP optimization")
+
+        fallback_args_fn = fallback_arguments or extract_tool_arguments
+
+        if fallback_invoker is None:
+            function_map = prompt.function_map or {}
+            fallback_invoker = function_map.get(tool_name)
+
+        mcp_config = MCPExecutionConfig(
+            coordinator=second_pass,
+            tool_name=tool_name,
+            fallback_arguments=fallback_args_fn,
+            fallback_invoker=fallback_invoker,
+            allow_tool_use_on_second_pass=allow_tool_use_on_second_pass,
+        )
+
+        tool_segment_id = f"tool:{tool_name}"
+        segments = extract_prompt_segments(prompt)
+        if tool_segment_id not in {segment.segment_id for segment in segments}:
+            raise ValueError(f"Tool '{tool_name}' not present in prompt tools")
+
+        return self.optimize_prompt(
+            prompt=prompt,
+            dataset=dataset,
+            metric=metric,
+            experiment_config=experiment_config,
+            n_samples=n_samples,
+            auto_continue=auto_continue,
+            agent_class=agent_class,
+            mcp_config=mcp_config,
+            candidate_generator=self._generate_mcp_candidate_prompts,
+            candidate_generator_kwargs={
+                "tool_segment_id": tool_segment_id,
+                "tool_name": tool_name,
+                "panel_style": panel_style,
+            },
+            tool_panel_style=panel_style,
+            **kwargs,
+        )
+
     def _optimize_prompt(
         self,
-        optimization_id: Optional[str],
+        optimization_id: str | None,
         prompt: chat_prompt.ChatPrompt,
         dataset: Dataset,
         metric: Callable,
-        experiment_config: Optional[Dict],
-        n_samples: Optional[int],
+        experiment_config: dict | None,
+        n_samples: int | None,
         auto_continue: bool,
+        mcp_config: MCPExecutionConfig | None = None,
+        candidate_generator: None
+        | (Callable[..., list[chat_prompt.ChatPrompt]]) = None,
+        candidate_generator_kwargs: dict[str, Any] | None = None,
+        tool_panel_style: str = "bright_magenta",
         **kwargs: Any,
     ) -> OptimizationResult:
         self.auto_continue = auto_continue
@@ -494,10 +664,11 @@ class MetaPromptOptimizer(BaseOptimizer):
                 experiment_config=experiment_config,
                 use_full_dataset=n_samples is None,
                 verbose=self.verbose,
+                mcp_config=mcp_config,
             )
             best_score = initial_score
             best_prompt = current_prompt
-            rounds: List[OptimizationRound] = []
+            rounds: list[OptimizationRound] = []

             baseline_reporter.set_score(initial_score)

@@ -510,8 +681,11 @@ class MetaPromptOptimizer(BaseOptimizer):
                 previous_best_score = best_score

                 # Step 1. Create a set of candidate prompts
+                generator = candidate_generator or self._generate_candidate_prompts
+                generator_kwargs = dict(candidate_generator_kwargs or {})
+
                 try:
-                    candidate_prompts = self._generate_candidate_prompts(
+                    candidate_prompts = generator(
                         project_name=self.agent_class.project_name,
                         current_prompt=best_prompt,
                         best_score=best_score,
@@ -519,25 +693,25 @@ class MetaPromptOptimizer(BaseOptimizer):
                         previous_rounds=rounds,
                         metric=metric,
                         optimization_id=optimization_id,
+                        **generator_kwargs,
                     )
                 except Exception as e:
                     round_reporter.failed_to_generate(self.num_prompts_per_round, e)
                     continue

                 # Step 2. Score each candidate prompt
-                prompt_scores: List[Tuple[chat_prompt.ChatPrompt, float]] = []
+                prompt_scores: list[tuple[chat_prompt.ChatPrompt, float]] = []
                 for candidate_count, prompt in enumerate(candidate_prompts):
                     with reporting.display_prompt_candidate_scoring_report(
                         verbose=self.verbose
                     ) as eval_report:
                         eval_report.set_generated_prompts(candidate_count, prompt)

-                        new_prompt = current_prompt.copy()
-                        new_prompt.set_messages(prompt.get_messages())
+                        candidate_prompt = prompt.copy()

                         try:
                             prompt_score = self._evaluate_prompt(
-                                prompt=new_prompt,
+                                prompt=candidate_prompt,
                                 optimization_id=optimization_id,
                                 dataset=dataset,
                                 metric=metric,
@@ -545,11 +719,12 @@ class MetaPromptOptimizer(BaseOptimizer):
                                 use_full_dataset=False,
                                 experiment_config=experiment_config,
                                 verbose=self.verbose,
+                                mcp_config=mcp_config,
                             )

                             eval_report.set_final_score(best_score, prompt_score)
                         except Exception:
-                            print("Failed evaluating agent; continuing...")
+                            logger.warning("Failed evaluating agent; continuing...")
                             prompt_score = 0

                         prompt_scores.append((prompt, prompt_score))
@@ -584,24 +759,39 @@ class MetaPromptOptimizer(BaseOptimizer):
                 best_score = best_cand_score_avg
                 best_prompt = best_candidate_this_round

+        if tool_panel_style and getattr(best_prompt, "tools", None):
+            description = (
+                best_prompt.tools[0].get("function", {}).get("description", "")
+                if best_prompt.tools
+                else ""
+            )
+            if description.strip():
+                reporting.display_tool_description(
+                    description.strip(),
+                    "Final tool description",
+                    tool_panel_style,
+                )
+
         reporting.display_result(
             initial_score,
             best_score,
             best_prompt.get_messages() if best_prompt is not None else [],
             verbose=self.verbose,
+            tools=getattr(best_prompt, "tools", None) if best_prompt else None,
         )

         return self._create_result(
             metric,
-            initial_prompt=initial_prompt.get_messages()
-            if initial_prompt is not None
-            else [],
+            initial_prompt=(
+                initial_prompt.get_messages() if initial_prompt is not None else []
+            ),
             best_prompt=best_prompt.get_messages() if best_prompt is not None else [],
             best_score=best_score,
             initial_score=initial_score,
             rounds=rounds,
             dataset_id=dataset.id,
             optimization_id=optimization_id,
+            best_tools=getattr(best_prompt, "tools", None) if best_prompt else None,
         )

     def _calculate_improvement(
@@ -620,19 +810,24 @@ class MetaPromptOptimizer(BaseOptimizer):
         current_best_prompt: chat_prompt.ChatPrompt,
         current_best_score: float,
         best_prompt_overall: chat_prompt.ChatPrompt,
-        evaluated_candidates: List[Tuple[chat_prompt.ChatPrompt, float]],
+        evaluated_candidates: list[tuple[chat_prompt.ChatPrompt, float]],
         previous_best_score: float,
         improvement_this_round: float,
     ) -> OptimizationRound:
         """Create an OptimizationRound object with the current round's data."""
-        generated_prompts_log = []
+        generated_prompts_log: list[dict[str, Any]] = []
        for prompt, score in evaluated_candidates:
             improvement_vs_prev = self._calculate_improvement(
                 score, previous_best_score
             )
+            tool_entries: list[Any] = []
+            if getattr(prompt, "tools", None):
+                tool_entries = copy.deepcopy(list(prompt.tools or []))
+
             generated_prompts_log.append(
                 {
                     "prompt": prompt.get_messages(),
+                    "tools": tool_entries,
                     "score": score,
                     "improvement": improvement_vs_prev,
                 }
@@ -651,13 +846,14 @@ class MetaPromptOptimizer(BaseOptimizer):
     def _create_result(
         self,
         metric: Callable,
-        initial_prompt: List[Dict[str, str]],
-        best_prompt: List[Dict[str, str]],
+        initial_prompt: list[dict[str, str]],
+        best_prompt: list[dict[str, str]],
         best_score: float,
         initial_score: float,
-        rounds: List[OptimizationRound],
-        dataset_id: Optional[str],
-        optimization_id: Optional[str],
+        rounds: list[OptimizationRound],
+        dataset_id: str | None,
+        optimization_id: str | None,
+        best_tools: list[dict[str, Any]] | None,
     ) -> OptimizationResult:
         """Create the final OptimizationResult object."""
         details = {
@@ -670,6 +866,18 @@ class MetaPromptOptimizer(BaseOptimizer):
             "temperature": self.model_kwargs.get("temperature"),
         }

+        if best_tools:
+            details["final_tools"] = best_tools
+
+        tool_prompts = None
+        if best_tools:
+            tool_prompts = {
+                (tool.get("function", {}).get("name") or f"tool_{idx}"): tool.get(
+                    "function", {}
+                ).get("description")
+                for idx, tool in enumerate(best_tools)
+            }
+
         return OptimizationResult(
             optimizer=self.__class__.__name__,
             prompt=best_prompt,
@@ -681,6 +889,7 @@ class MetaPromptOptimizer(BaseOptimizer):
             llm_calls=self.llm_call_counter,
             dataset_id=dataset_id,
             optimization_id=optimization_id,
+            tool_prompts=tool_prompts,
         )

     def _get_task_context(self, metric: Callable) -> str:
@@ -714,11 +923,11 @@ class MetaPromptOptimizer(BaseOptimizer):
         current_prompt: chat_prompt.ChatPrompt,
         best_score: float,
         round_num: int,
-        previous_rounds: List[OptimizationRound],
+        previous_rounds: list[OptimizationRound],
         metric: Callable,
-        optimization_id: Optional[str] = None,
-        project_name: Optional[str] = None,
-    ) -> List[chat_prompt.ChatPrompt]:
+        optimization_id: str | None = None,
+        project_name: str | None = None,
+    ) -> list[chat_prompt.ChatPrompt]:
         """Generate candidate prompts using meta-prompting."""
         with reporting.display_candidate_generation_report(
             self.num_prompts_per_round, verbose=self.verbose
@@ -819,7 +1028,7 @@ class MetaPromptOptimizer(BaseOptimizer):
                 )

                 # Extract and log valid prompts
-                valid_prompts: List[chat_prompt.ChatPrompt] = []
+                valid_prompts: list[chat_prompt.ChatPrompt] = []
                 for item in json_result["prompts"]:
                     if (
                         isinstance(item, dict)
@@ -870,7 +1079,128 @@ class MetaPromptOptimizer(BaseOptimizer):
                 f"Unexpected error during candidate prompt generation: {e}"
             )

-    def _build_history_context(self, previous_rounds: List[OptimizationRound]) -> str:
+    def _generate_mcp_candidate_prompts(
+        self,
+        current_prompt: chat_prompt.ChatPrompt,
+        best_score: float,
+        round_num: int,
+        previous_rounds: list[OptimizationRound],
+        metric: Callable,
+        tool_segment_id: str,
+        tool_name: str,
+        optimization_id: str | None = None,
+        project_name: str | None = None,
+        panel_style: str = "bright_magenta",
+    ) -> list[chat_prompt.ChatPrompt]:
+        segments = {
+            segment.segment_id: segment
+            for segment in extract_prompt_segments(current_prompt)
+        }
+        if tool_segment_id not in segments:
+            raise ValueError(f"Tool segment '{tool_segment_id}' not found in prompt")
+
+        target_segment = segments[tool_segment_id]
+        current_description = target_segment.content
+        tool_metadata = target_segment.metadata.get("raw_tool", {})
+
+        history_context = self._build_history_context(previous_rounds)
+
+        instruction = textwrap.dedent(
+            f"""
+            Current tool name: {tool_name}
+            Current tool description:
+            ---
+            {current_description}
+            ---
+
+            Tool metadata (JSON):
+            {json.dumps(tool_metadata, indent=2)}
+
+            Current best score: {best_score:.4f}
+            {history_context}
+
+            Generate {self.num_prompts_per_round} improved descriptions for this tool.
+            Each description should clarify expected input arguments and set explicit expectations
+            for how the tool output must be used in the final response.
+            Avoid changing unrelated parts of the prompt. Focus only on the description text for `{tool_name}`.
+
+            Return a JSON object of the form:
+            {{
+                "prompts": [
+                    {{
+                        "tool_description": "...",
+                        "improvement_focus": "...",
+                        "reasoning": "..."
+                    }}
+                ]
+            }}
+            """
+        ).strip()
+
+        with reporting.display_candidate_generation_report(
+            self.num_prompts_per_round, verbose=self.verbose
+        ) as candidate_generation_report:
+            try:
+                content = self._call_model(
+                    project_name,
+                    messages=[
+                        {"role": "system", "content": self._REASONING_SYSTEM_PROMPT},
+                        {"role": "user", "content": instruction},
+                    ],
+                    is_reasoning=True,
+                    optimization_id=optimization_id,
+                )
+
+                try:
+                    json_result = json.loads(content)
+                except json.JSONDecodeError:
+                    import re
+
+                    json_match = re.search(r"\{.*\}", content, re.DOTALL)
+                    if not json_match:
+                        raise ValueError("No JSON object found in reasoning output")
+                    json_result = json.loads(json_match.group())
+
+                prompts_payload = json_result.get("prompts")
+                if not isinstance(prompts_payload, list):
+                    raise ValueError("Reasoning output missing 'prompts' list")
+
+                candidate_generation_report.set_generated_prompts()
+
+                candidates: list[chat_prompt.ChatPrompt] = []
+                for item in prompts_payload:
+                    if not isinstance(item, dict):
+                        continue
+                    description = item.get("tool_description")
+                    if not isinstance(description, str) or not description.strip():
+                        continue
+
+                    updated_prompt = apply_segment_updates(
+                        current_prompt,
+                        {tool_segment_id: description.strip()},
+                    )
+                    _sync_tool_description_in_system(updated_prompt)
+                    if (
+                        description.strip()
+                        and description.strip() != current_description.strip()
+                    ):
+                        reporting.display_tool_description(
+                            description.strip(),
+                            f"Round {round_num + 1} tool description",
+                            panel_style,
+                        )
+                    candidates.append(updated_prompt)
+
+                if not candidates:
+                    raise ValueError(
+                        "Reasoning output did not produce valid tool descriptions"
+                    )
+
+                return candidates
+            except Exception as exc:
+                raise ValueError(f"Error generating MCP prompt candidates: {exc}")
+
+    def _build_history_context(self, previous_rounds: list[OptimizationRound]) -> str:
         """Build context from previous optimization rounds."""
         if not previous_rounds:
             return ""
@@ -896,7 +1226,7 @@ class MetaPromptOptimizer(BaseOptimizer):

     def _get_evaluation_subset(
         self, dataset: opik.Dataset, min_size: int = 20, max_size: int = 100
-    ) -> List[Dict[str, Any]]:
+    ) -> list[dict[str, Any]]:
         """Get a random subset of the dataset for evaluation.

         Returns:
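
The diff also introduces an optimize_mcp entry point (hunk -449,15 +554,80 above). A minimal sketch of how it might be called, based only on that signature: the tool schema, dataset, metric, and MCPSecondPassCoordinator setup below are placeholders, and passing tools/function_map to the ChatPrompt constructor is an assumption (the hunks only show them as prompt attributes):

from opik_optimizer import ChatPrompt, MetaPromptOptimizer

def search_docs(args):              # hypothetical local tool used as the fallback invoker
    return "stubbed documentation summary"

def my_metric(dataset_item, llm_output):  # assumed (dataset_item, llm_output) metric signature
    return float(dataset_item["answer"] in llm_output)

optimizer = MetaPromptOptimizer(model="openai/gpt-4o-mini")  # placeholder model name
prompt = ChatPrompt(
    system="Use the documentation tool before answering.",
    user="{question}",
    tools=[{"type": "function", "function": {"name": "search_docs", "description": "..."}}],
    function_map={"search_docs": search_docs},
)

result = optimizer.optimize_mcp(
    prompt=prompt,
    dataset=dataset,                # an opik.Dataset prepared elsewhere
    metric=my_metric,
    tool_name="search_docs",
    second_pass=coordinator,        # an MCPSecondPassCoordinator from opik_optimizer.mcp_utils.mcp_workflow
    n_samples=20,
)
print(result.tool_prompts)          # optimized tool description(s), per _create_result in the hunks above

The single-tool assumption matters here: both _sync_tool_description_in_system and the final reporting read prompt.tools[0], so the optimized description is the first tool's.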