opik-optimizer 1.0.6__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff shows the changes between publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
Files changed (56)
  1. opik_optimizer/__init__.py +4 -0
  2. opik_optimizer/_throttle.py +2 -1
  3. opik_optimizer/base_optimizer.py +402 -28
  4. opik_optimizer/data/context7_eval.jsonl +3 -0
  5. opik_optimizer/datasets/context7_eval.py +90 -0
  6. opik_optimizer/datasets/tiny_test.py +33 -34
  7. opik_optimizer/datasets/truthful_qa.py +2 -2
  8. opik_optimizer/evolutionary_optimizer/crossover_ops.py +194 -0
  9. opik_optimizer/evolutionary_optimizer/evaluation_ops.py +136 -0
  10. opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +289 -966
  11. opik_optimizer/evolutionary_optimizer/helpers.py +10 -0
  12. opik_optimizer/evolutionary_optimizer/llm_support.py +136 -0
  13. opik_optimizer/evolutionary_optimizer/mcp.py +249 -0
  14. opik_optimizer/evolutionary_optimizer/mutation_ops.py +306 -0
  15. opik_optimizer/evolutionary_optimizer/population_ops.py +228 -0
  16. opik_optimizer/evolutionary_optimizer/prompts.py +352 -0
  17. opik_optimizer/evolutionary_optimizer/reporting.py +28 -4
  18. opik_optimizer/evolutionary_optimizer/style_ops.py +86 -0
  19. opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +90 -81
  20. opik_optimizer/few_shot_bayesian_optimizer/reporting.py +12 -5
  21. opik_optimizer/gepa_optimizer/__init__.py +3 -0
  22. opik_optimizer/gepa_optimizer/adapter.py +154 -0
  23. opik_optimizer/gepa_optimizer/gepa_optimizer.py +653 -0
  24. opik_optimizer/gepa_optimizer/reporting.py +181 -0
  25. opik_optimizer/logging_config.py +42 -7
  26. opik_optimizer/mcp_utils/__init__.py +22 -0
  27. opik_optimizer/mcp_utils/mcp.py +541 -0
  28. opik_optimizer/mcp_utils/mcp_second_pass.py +152 -0
  29. opik_optimizer/mcp_utils/mcp_simulator.py +116 -0
  30. opik_optimizer/mcp_utils/mcp_workflow.py +547 -0
  31. opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +470 -134
  32. opik_optimizer/meta_prompt_optimizer/reporting.py +16 -2
  33. opik_optimizer/mipro_optimizer/_lm.py +30 -23
  34. opik_optimizer/mipro_optimizer/_mipro_optimizer_v2.py +52 -51
  35. opik_optimizer/mipro_optimizer/mipro_optimizer.py +126 -46
  36. opik_optimizer/mipro_optimizer/utils.py +2 -4
  37. opik_optimizer/optimizable_agent.py +21 -16
  38. opik_optimizer/optimization_config/chat_prompt.py +44 -23
  39. opik_optimizer/optimization_config/configs.py +3 -3
  40. opik_optimizer/optimization_config/mappers.py +9 -8
  41. opik_optimizer/optimization_result.py +22 -14
  42. opik_optimizer/reporting_utils.py +61 -10
  43. opik_optimizer/task_evaluator.py +9 -8
  44. opik_optimizer/utils/__init__.py +15 -0
  45. opik_optimizer/utils/colbert.py +236 -0
  46. opik_optimizer/{utils.py → utils/core.py} +160 -33
  47. opik_optimizer/utils/dataset_utils.py +49 -0
  48. opik_optimizer/utils/prompt_segments.py +186 -0
  49. opik_optimizer-2.0.0.dist-info/METADATA +345 -0
  50. opik_optimizer-2.0.0.dist-info/RECORD +74 -0
  51. opik_optimizer-2.0.0.dist-info/licenses/LICENSE +203 -0
  52. opik_optimizer-1.0.6.dist-info/METADATA +0 -181
  53. opik_optimizer-1.0.6.dist-info/RECORD +0 -50
  54. opik_optimizer-1.0.6.dist-info/licenses/LICENSE +0 -21
  55. {opik_optimizer-1.0.6.dist-info → opik_optimizer-2.0.0.dist-info}/WHEEL +0 -0
  56. {opik_optimizer-1.0.6.dist-info → opik_optimizer-2.0.0.dist-info}/top_level.txt +0 -0
opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py
@@ -1,19 +1,21 @@
+import copy
 import json
 import logging
 import os
-from typing import Any, Callable, Dict, List, Optional, Tuple, Type
+import textwrap
+import warnings
+from typing import Any, cast
+from collections.abc import Callable
 
 import litellm
 import opik
 from litellm.caching import Cache
 from litellm.types.caching import LiteLLMCacheType
 from opik import Dataset
-from opik.api_objects import opik_client
 from opik.environment import get_tqdm_for_current_environment
 from opik.evaluation.models.litellm import opik_monitor as opik_litellm_monitor
 
 from opik_optimizer import task_evaluator
-from opik_optimizer import utils
 
 from .. import _throttle
 from ..base_optimizer import BaseOptimizer, OptimizationRound
@@ -21,6 +23,15 @@ from ..optimization_config import chat_prompt, mappers
 from ..optimization_result import OptimizationResult
 from ..optimizable_agent import OptimizableAgent
 from . import reporting
+import re
+
+from ..mcp_utils.mcp import PROMPT_TOOL_FOOTER, PROMPT_TOOL_HEADER
+from ..mcp_utils.mcp_workflow import (
+    MCPExecutionConfig,
+    MCPSecondPassCoordinator,
+    extract_tool_arguments,
+)
+from ..utils.prompt_segments import apply_segment_updates, extract_prompt_segments
 
 tqdm = get_tqdm_for_current_environment()
 
@@ -34,6 +45,48 @@ logger = logging.getLogger(__name__)  # Gets logger configured by setup_logging
 _rate_limiter = _throttle.get_rate_limiter_for_current_opik_installation()
 
 
+def _sync_tool_description_in_system(prompt: chat_prompt.ChatPrompt) -> None:
+    if not prompt.system or not getattr(prompt, "tools", None):
+        return
+
+    description = (
+        prompt.tools[0].get("function", {}).get("description") if prompt.tools else None
+    )
+    if not description:
+        return
+
+    tool_name = (
+        prompt.tools[0].get("function", {}).get("name") if prompt.tools else None
+    )
+
+    system_text = cast(str, prompt.system)
+    if PROMPT_TOOL_HEADER not in system_text or PROMPT_TOOL_FOOTER not in system_text:
+        return
+
+    start = system_text.index(PROMPT_TOOL_HEADER) + len(PROMPT_TOOL_HEADER)
+    end = system_text.index(PROMPT_TOOL_FOOTER)
+    description_text = description.strip()
+    system_text = (
+        system_text[:start] + "\n" + description_text + "\n" + system_text[end:]
+    )
+    prompt.system = system_text
+
+    if tool_name:
+        pattern = rf"(-\s*{re.escape(tool_name)}:\s)(.*)"
+
+        def _tool_section_replacer(match: re.Match[str]) -> str:
+            return f"{match.group(1)}{description_text}"
+
+        system_text = re.sub(
+            pattern,
+            _tool_section_replacer,
+            system_text,
+            count=1,
+            flags=re.MULTILINE,
+        )
+        prompt.system = system_text
+
+
 class MetaPromptOptimizer(BaseOptimizer):
     """
     The Meta-Prompt Optimizer uses meta-prompting to improve prompts based on examples and performance.
@@ -82,13 +135,14 @@ class MetaPromptOptimizer(BaseOptimizer):
     def __init__(
         self,
         model: str,
-        reasoning_model: Optional[str] = None,
+        reasoning_model: str | None = None,
         rounds: int = DEFAULT_ROUNDS,
         num_prompts_per_round: int = DEFAULT_PROMPTS_PER_ROUND,
-        num_threads: Optional[int] = None,
+        num_threads: int | None = None,
         verbose: int = 1,
         enable_context: bool = True,
         n_threads: int = 12,
+        seed: int = 42,
         **model_kwargs: Any,
     ) -> None:
         """
@@ -103,22 +157,28 @@ class MetaPromptOptimizer(BaseOptimizer):
             **model_kwargs: Additional model parameters
         """
         if "project_name" in model_kwargs:
-            print(
-                "Removing `project_name` from constructor; it now belongs in the ChatPrompt()"
+            warnings.warn(
+                "The 'project_name' parameter in optimizer constructor is deprecated. "
+                "Set project_name in the ChatPrompt instead.",
+                DeprecationWarning,
+                stacklevel=2,
             )
             del model_kwargs["project_name"]
 
-        super().__init__(model=model, verbose=verbose, **model_kwargs)
+        super().__init__(model=model, verbose=verbose, seed=seed, **model_kwargs)
         self.reasoning_model = reasoning_model if reasoning_model is not None else model
         self.rounds = rounds
         self.num_prompts_per_round = num_prompts_per_round
         if num_threads is not None:
-            print("num_threads is deprecated; use n_threads instead")
+            warnings.warn(
+                "The 'num_threads' parameter is deprecated and will be removed in a future version. "
+                "Use 'n_threads' instead.",
+                DeprecationWarning,
+                stacklevel=2,
+            )
             n_threads = num_threads
         self.num_threads = n_threads
-        self.dataset: Optional[Dataset] = None
-        self._opik_client = opik_client.get_client_cached()
-        self.llm_call_counter = 0
+        self.dataset: Dataset | None = None
         self.enable_context = enable_context
         logger.debug(
             f"Initialized MetaPromptOptimizer with model={model}, reasoning_model={self.reasoning_model}"
@@ -127,16 +187,24 @@ class MetaPromptOptimizer(BaseOptimizer):
             f"Optimization rounds: {rounds}, Prompts/round: {num_prompts_per_round}"
         )
 
+    def get_optimizer_metadata(self) -> dict[str, Any]:
+        return {
+            "rounds": self.rounds,
+            "num_prompts_per_round": self.num_prompts_per_round,
+            "reasoning_model": self.reasoning_model,
+            "enable_context": self.enable_context,
+        }
+
     @_throttle.rate_limited(_rate_limiter)
     def _call_model(
         self,
         project_name: str,
-        messages: List[Dict[str, str]],
+        messages: list[dict[str, str]],
         is_reasoning: bool = False,
-        optimization_id: Optional[str] = None,
+        optimization_id: str | None = None,
     ) -> str:
         """Call the model with the given prompt and return the response."""
-        self.llm_call_counter += 1
+        self.increment_llm_counter()
         # Note: Basic retry logic could be added here using tenacity
         try:
             # Basic LLM parameters (e.g., temperature, max_tokens)
@@ -163,7 +231,7 @@ class MetaPromptOptimizer(BaseOptimizer):
             }
 
             # Prepare metadata that we want to be part of the LLM call context.
-            metadata_for_opik: Dict[str, Any] = {}
+            metadata_for_opik: dict[str, Any] = {}
             if project_name:
                 metadata_for_opik["project_name"] = (
                     project_name  # Top-level for general use
@@ -225,11 +293,12 @@ class MetaPromptOptimizer(BaseOptimizer):
         prompt: chat_prompt.ChatPrompt,
         dataset: opik.Dataset,
         metric: Callable,
-        n_samples: Optional[int] = None,
-        dataset_item_ids: Optional[List[str]] = None,
-        experiment_config: Optional[Dict] = None,
+        n_samples: int | None = None,
+        dataset_item_ids: list[str] | None = None,
+        experiment_config: dict | None = None,
         use_full_dataset: bool = True,
-        optimization_id: Optional[str] = None,
+        optimization_id: str | None = None,
+        mcp_config: MCPExecutionConfig | None = None,
         **kwargs: Any,
     ) -> float:
         """
@@ -266,58 +335,103 @@ class MetaPromptOptimizer(BaseOptimizer):
             subset_size = None  # Use all items for final checks
             logger.debug("Using full dataset for evaluation")
 
-        experiment_config = experiment_config or {}
-        experiment_config = {
-            **experiment_config,
-            **{
-                "optimizer": self.__class__.__name__,
-                "agent_class": self.agent_class.__name__,
-                "agent_config": prompt.to_dict(),
-                "metric": getattr(metric, "__name__", str(metric)),
-                "dataset": dataset.name,
-                "configuration": {
-                    "prompt": prompt.get_messages(),
-                    "n_samples": subset_size,
-                    "use_full_dataset": use_full_dataset,
-                },
-            },
-        }
-        if optimization_id:
-            experiment_config["optimization_id"] = optimization_id
-
-        def llm_task(dataset_item: Dict[str, Any]) -> Dict[str, str]:
-            # --- Step 1: Prepare the prompt for the LLM ---
-            # messages = [
-            #     {
-            #         "role": item["role"],
-            #         "content": item["content"].format(**dataset_item),
-            #     }
-            #     for item in prompt.get_messages()
-            # ]
-            # Step 1: create the agent
+        configuration_updates = self._drop_none(
+            {
+                "n_samples": subset_size,
+                "use_full_dataset": use_full_dataset,
+            }
+        )
+        meta_metadata = self._drop_none(
+            {
+                "optimization_id": optimization_id,
+                "stage": "trial_evaluation" if not use_full_dataset else "final_eval",
+            }
+        )
+        experiment_config = self._prepare_experiment_config(
+            prompt=prompt,
+            dataset=dataset,
+            metric=metric,
+            experiment_config=experiment_config,
+            configuration_updates=configuration_updates,
+            additional_metadata={"meta_prompt": meta_metadata}
+            if meta_metadata
+            else None,
+        )
+
+        def llm_task(dataset_item: dict[str, Any]) -> dict[str, str]:
             new_prompt = prompt.copy()
             messages = new_prompt.get_messages(dataset_item)
             new_prompt.set_messages(messages)
             agent = self.agent_class(new_prompt)
 
-            # --- Step 2: Call the model ---
-            try:
-                logger.debug(
-                    f"Calling LLM with prompt length: {sum(len(msg['content']) for msg in messages)}"
-                )
-                raw_model_output = agent.invoke(messages)
-                logger.debug(f"LLM raw response length: {len(raw_model_output)}")
-                logger.debug(f"LLM raw output: {raw_model_output}")
-            except Exception as e:
-                logger.error(f"Error calling model with prompt: {e}")
-                logger.error(f"Failed prompt: {messages}")
-                logger.error(
-                    f"Prompt length: {sum(len(msg['content']) for msg in messages)}"
+            if mcp_config is not None:
+                coordinator = mcp_config.coordinator
+                coordinator.reset()
+                try:
+                    logger.debug(
+                        "Calling MCP-enabled LLM with tool access; prompt length=%s",
+                        sum(len(msg["content"]) for msg in messages),
+                    )
+                    raw_model_output = agent.llm_invoke(
+                        messages=messages,
+                        seed=self.seed,
+                        allow_tool_use=True,
+                    )
+                except Exception as exc:
+                    logger.error("Error during MCP first pass: %s", exc)
+                    raise
+
+                second_pass_messages = coordinator.build_second_pass_messages(
+                    base_messages=messages,
+                    dataset_item=dataset_item,
                 )
-                raise
 
-            # --- Step 3: Clean the model's output before metric evaluation ---
-            cleaned_model_output = raw_model_output.strip()
+                if second_pass_messages is None and mcp_config.fallback_invoker:
+                    fallback_args = mcp_config.fallback_arguments(dataset_item)
+                    if fallback_args:
+                        logger.debug(
+                            "MCP fallback triggered for tool %s with args=%s",
+                            mcp_config.tool_name,
+                            fallback_args,
+                        )
+                        summary_override = mcp_config.fallback_invoker(fallback_args)
+                        second_pass_messages = coordinator.build_second_pass_messages(
+                            base_messages=messages,
+                            dataset_item=dataset_item,
+                            summary_override=summary_override,
+                        )
+
+                if second_pass_messages is not None:
+                    logger.debug(
+                        "Executing MCP second pass with %d messages",
+                        len(second_pass_messages),
+                    )
+                    final_response = agent.llm_invoke(
+                        messages=second_pass_messages,
+                        seed=self.seed,
+                        allow_tool_use=mcp_config.allow_tool_use_on_second_pass,
+                    )
+                else:
+                    final_response = raw_model_output
+
+                cleaned_model_output = final_response.strip()
+            else:
+                try:
+                    logger.debug(
+                        f"Calling LLM with prompt length: {sum(len(msg['content']) for msg in messages)}"
+                    )
+                    raw_model_output = agent.invoke(messages)
+                    logger.debug(f"LLM raw response length: {len(raw_model_output)}")
+                    logger.debug(f"LLM raw output: {raw_model_output}")
+                except Exception as e:
+                    logger.error(f"Error calling model with prompt: {e}")
+                    logger.error(f"Failed prompt: {messages}")
+                    logger.error(
+                        f"Prompt length: {sum(len(msg['content']) for msg in messages)}"
+                    )
+                    raise
+
+                cleaned_model_output = raw_model_output.strip()
 
             result = {
                 mappers.EVALUATED_LLM_TASK_OUTPUT: cleaned_model_output,
@@ -348,46 +462,39 @@ class MetaPromptOptimizer(BaseOptimizer):
         prompt: chat_prompt.ChatPrompt,
         dataset: Dataset,
         metric: Callable,
-        experiment_config: Optional[Dict] = None,
-        n_samples: Optional[int] = None,
+        experiment_config: dict | None = None,
+        n_samples: int | None = None,
         auto_continue: bool = False,
-        agent_class: Optional[Type[OptimizableAgent]] = None,
+        agent_class: type[OptimizableAgent] | None = None,
         **kwargs: Any,
     ) -> OptimizationResult:
+        mcp_config = kwargs.pop("mcp_config", None)
+        candidate_generator = kwargs.pop("candidate_generator", None)
+        candidate_generator_kwargs = kwargs.pop("candidate_generator_kwargs", None)
+
         """
         Optimize a prompt using meta-reasoning.
 
         Args:
+            prompt: The prompt to optimize
            dataset: The dataset to evaluate against
            metric: The metric to use for evaluation
            experiment_config: A dictionary to log with the experiments
            n_samples: The number of dataset items to use for evaluation
            auto_continue: If True, the algorithm may continue if goal not met
-            **kwargs: Additional arguments for evaluation
+            agent_class: Optional agent class to use
+            **kwargs: Additional arguments for evaluation, including:
+                mcp_config (MCPExecutionConfig | None): MCP tool calling configuration (default: None)
+                candidate_generator: Optional candidate generator
+                candidate_generator_kwargs: Optional kwargs for candidate generator
 
         Returns:
            OptimizationResult: Structured result containing optimization details
        """
-        if not isinstance(prompt, chat_prompt.ChatPrompt):
-            raise ValueError("Prompt must be a ChatPrompt object")
-
-        if not isinstance(dataset, Dataset):
-            raise ValueError("Dataset must be a Dataset object")
-
-        if not callable(metric):
-            raise ValueError(
-                "Metric must be a function that takes `dataset_item` and `llm_output` as arguments."
-            )
-
-        if prompt.model is None:
-            prompt.model = self.model
-        if prompt.model_kwargs is None:
-            prompt.model_kwargs = self.model_kwargs
-
-        if agent_class is None:
-            self.agent_class = utils.create_litellm_agent_class(prompt)
-        else:
-            self.agent_class = agent_class
+        # Use base class validation and setup methods
+        self.validate_optimization_inputs(prompt, dataset, metric)
+        self.configure_prompt_model(prompt)
+        self.agent_class = self.setup_agent_class(prompt, agent_class)
 
         total_items = len(dataset.get_items())
         if n_samples is not None and n_samples > total_items:
@@ -398,7 +505,7 @@ class MetaPromptOptimizer(BaseOptimizer):
 
         optimization = None
         try:
-            optimization = self._opik_client.create_optimization(
+            optimization = self.opik_client.create_optimization(
                dataset_name=dataset.name,
                objective_name=getattr(metric, "__name__", str(metric)),
                metadata={"optimizer": self.__class__.__name__},
@@ -424,6 +531,7 @@ class MetaPromptOptimizer(BaseOptimizer):
                 "auto_continue": auto_continue,
             },
             verbose=self.verbose,
+            tools=getattr(prompt, "tools", None),
         )
 
         try:
@@ -436,6 +544,9 @@ class MetaPromptOptimizer(BaseOptimizer):
                 experiment_config=experiment_config,
                 n_samples=n_samples,
                 auto_continue=auto_continue,
+                mcp_config=mcp_config,
+                candidate_generator=candidate_generator,
+                candidate_generator_kwargs=candidate_generator_kwargs,
                 **kwargs,
             )
             if optimization:
@@ -449,40 +560,104 @@ class MetaPromptOptimizer(BaseOptimizer):
                 logger.debug("Optimization marked as cancelled")
             raise e
 
+    def optimize_mcp(
+        self,
+        prompt: chat_prompt.ChatPrompt,
+        dataset: Dataset,
+        metric: Callable,
+        *,
+        tool_name: str,
+        second_pass: MCPSecondPassCoordinator,
+        experiment_config: dict | None = None,
+        n_samples: int | None = None,
+        auto_continue: bool = False,
+        agent_class: type[OptimizableAgent] | None = None,
+        fallback_invoker: Callable[[dict[str, Any]], str] | None = None,
+        fallback_arguments: Callable[[Any], dict[str, Any]] | None = None,
+        allow_tool_use_on_second_pass: bool = False,
+        **kwargs: Any,
+    ) -> OptimizationResult:
+        panel_style = kwargs.pop("tool_panel_style", "bright_magenta")
+
+        if prompt.tools is None or not prompt.tools:
+            raise ValueError("Prompt must include tools for MCP optimization")
+
+        fallback_args_fn = fallback_arguments or extract_tool_arguments
+
+        if fallback_invoker is None:
+            function_map = prompt.function_map or {}
+            fallback_invoker = function_map.get(tool_name)
+
+        mcp_config = MCPExecutionConfig(
+            coordinator=second_pass,
+            tool_name=tool_name,
+            fallback_arguments=fallback_args_fn,
+            fallback_invoker=fallback_invoker,
+            allow_tool_use_on_second_pass=allow_tool_use_on_second_pass,
+        )
+
+        tool_segment_id = f"tool:{tool_name}"
+        segments = extract_prompt_segments(prompt)
+        if tool_segment_id not in {segment.segment_id for segment in segments}:
+            raise ValueError(f"Tool '{tool_name}' not present in prompt tools")
+
+        return self.optimize_prompt(
+            prompt=prompt,
+            dataset=dataset,
+            metric=metric,
+            experiment_config=experiment_config,
+            n_samples=n_samples,
+            auto_continue=auto_continue,
+            agent_class=agent_class,
+            mcp_config=mcp_config,
+            candidate_generator=self._generate_mcp_candidate_prompts,
+            candidate_generator_kwargs={
+                "tool_segment_id": tool_segment_id,
+                "tool_name": tool_name,
+                "panel_style": panel_style,
+            },
+            tool_panel_style=panel_style,
+            **kwargs,
+        )
+
     def _optimize_prompt(
         self,
-        optimization_id: Optional[str],
+        optimization_id: str | None,
         prompt: chat_prompt.ChatPrompt,
         dataset: Dataset,
         metric: Callable,
-        experiment_config: Optional[Dict],
-        n_samples: Optional[int],
+        experiment_config: dict | None,
+        n_samples: int | None,
         auto_continue: bool,
+        mcp_config: MCPExecutionConfig | None = None,
+        candidate_generator: None
+        | (Callable[..., list[chat_prompt.ChatPrompt]]) = None,
+        candidate_generator_kwargs: dict[str, Any] | None = None,
+        tool_panel_style: str = "bright_magenta",
         **kwargs: Any,
     ) -> OptimizationResult:
         self.auto_continue = auto_continue
         self.dataset = dataset
         self.prompt = prompt
-        self.llm_call_counter = 0  # Reset counter for run
+        self.reset_counters()  # Reset counters for run
         initial_prompt = prompt
 
         current_prompt = prompt
-        experiment_config = experiment_config or {}
-        experiment_config = {
-            **experiment_config,
-            **{
-                "optimizer": self.__class__.__name__,
-                "agent_class": self.agent_class.__name__,
-                "agent_config": prompt.to_dict(),
-                "metric": getattr(metric, "__name__", str(metric)),
-                "dataset": dataset.name,
-                "configuration": {
-                    "prompt": prompt.get_messages(),
-                    "rounds": self.rounds,
-                    "num_prompts_per_round": self.num_prompts_per_round,
-                },
-            },
-        }
+        configuration_updates = self._drop_none(
+            {
+                "rounds": self.rounds,
+                "num_prompts_per_round": self.num_prompts_per_round,
+            }
+        )
+        meta_metadata = {"stage": "initial"}
+        experiment_config = self._prepare_experiment_config(
+            prompt=prompt,
+            dataset=dataset,
+            metric=metric,
+            experiment_config=experiment_config,
+            configuration_updates=configuration_updates,
+            additional_metadata={"meta_prompt": meta_metadata},
+        )
 
         with reporting.display_evaluation(verbose=self.verbose) as baseline_reporter:
             initial_score = self._evaluate_prompt(
@@ -494,10 +669,11 @@ class MetaPromptOptimizer(BaseOptimizer):
                 experiment_config=experiment_config,
                 use_full_dataset=n_samples is None,
                 verbose=self.verbose,
+                mcp_config=mcp_config,
             )
             best_score = initial_score
             best_prompt = current_prompt
-            rounds: List[OptimizationRound] = []
+            rounds: list[OptimizationRound] = []
 
             baseline_reporter.set_score(initial_score)
 
@@ -510,8 +686,11 @@ class MetaPromptOptimizer(BaseOptimizer):
                 previous_best_score = best_score
 
                 # Step 1. Create a set of candidate prompts
+                generator = candidate_generator or self._generate_candidate_prompts
+                generator_kwargs = dict(candidate_generator_kwargs or {})
+
                 try:
-                    candidate_prompts = self._generate_candidate_prompts(
+                    candidate_prompts = generator(
                         project_name=self.agent_class.project_name,
                         current_prompt=best_prompt,
                         best_score=best_score,
@@ -519,25 +698,25 @@ class MetaPromptOptimizer(BaseOptimizer):
                         previous_rounds=rounds,
                         metric=metric,
                         optimization_id=optimization_id,
+                        **generator_kwargs,
                     )
                 except Exception as e:
                     round_reporter.failed_to_generate(self.num_prompts_per_round, e)
                     continue
 
                 # Step 2. Score each candidate prompt
-                prompt_scores: List[Tuple[chat_prompt.ChatPrompt, float]] = []
+                prompt_scores: list[tuple[chat_prompt.ChatPrompt, float]] = []
                 for candidate_count, prompt in enumerate(candidate_prompts):
                     with reporting.display_prompt_candidate_scoring_report(
                         verbose=self.verbose
                     ) as eval_report:
                         eval_report.set_generated_prompts(candidate_count, prompt)
 
-                        new_prompt = current_prompt.copy()
-                        new_prompt.set_messages(prompt.get_messages())
+                        candidate_prompt = prompt.copy()
 
                         try:
                             prompt_score = self._evaluate_prompt(
-                                prompt=new_prompt,
+                                prompt=candidate_prompt,
                                 optimization_id=optimization_id,
                                 dataset=dataset,
                                 metric=metric,
@@ -545,11 +724,12 @@ class MetaPromptOptimizer(BaseOptimizer):
                                 use_full_dataset=False,
                                 experiment_config=experiment_config,
                                 verbose=self.verbose,
+                                mcp_config=mcp_config,
                             )
 
                             eval_report.set_final_score(best_score, prompt_score)
                         except Exception:
-                            print("Failed evaluating agent; continuing...")
+                            logger.warning("Failed evaluating agent; continuing...")
                            prompt_score = 0
 
                        prompt_scores.append((prompt, prompt_score))
@@ -584,24 +764,39 @@ class MetaPromptOptimizer(BaseOptimizer):
                     best_score = best_cand_score_avg
                     best_prompt = best_candidate_this_round
 
+        if tool_panel_style and getattr(best_prompt, "tools", None):
+            description = (
+                best_prompt.tools[0].get("function", {}).get("description", "")
+                if best_prompt.tools
+                else ""
+            )
+            if description.strip():
+                reporting.display_tool_description(
+                    description.strip(),
+                    "Final tool description",
+                    tool_panel_style,
+                )
+
         reporting.display_result(
             initial_score,
             best_score,
             best_prompt.get_messages() if best_prompt is not None else [],
             verbose=self.verbose,
+            tools=getattr(best_prompt, "tools", None) if best_prompt else None,
         )
 
         return self._create_result(
             metric,
-            initial_prompt=initial_prompt.get_messages()
-            if initial_prompt is not None
-            else [],
+            initial_prompt=(
+                initial_prompt.get_messages() if initial_prompt is not None else []
+            ),
             best_prompt=best_prompt.get_messages() if best_prompt is not None else [],
             best_score=best_score,
             initial_score=initial_score,
             rounds=rounds,
             dataset_id=dataset.id,
             optimization_id=optimization_id,
+            best_tools=getattr(best_prompt, "tools", None) if best_prompt else None,
         )
 
     def _calculate_improvement(
@@ -620,19 +815,24 @@ class MetaPromptOptimizer(BaseOptimizer):
         current_best_prompt: chat_prompt.ChatPrompt,
         current_best_score: float,
         best_prompt_overall: chat_prompt.ChatPrompt,
-        evaluated_candidates: List[Tuple[chat_prompt.ChatPrompt, float]],
+        evaluated_candidates: list[tuple[chat_prompt.ChatPrompt, float]],
         previous_best_score: float,
         improvement_this_round: float,
     ) -> OptimizationRound:
         """Create an OptimizationRound object with the current round's data."""
-        generated_prompts_log = []
+        generated_prompts_log: list[dict[str, Any]] = []
         for prompt, score in evaluated_candidates:
             improvement_vs_prev = self._calculate_improvement(
                 score, previous_best_score
             )
+            tool_entries: list[Any] = []
+            if getattr(prompt, "tools", None):
+                tool_entries = copy.deepcopy(list(prompt.tools or []))
+
             generated_prompts_log.append(
                 {
                     "prompt": prompt.get_messages(),
+                    "tools": tool_entries,
                     "score": score,
                    "improvement": improvement_vs_prev,
                }
@@ -651,13 +851,14 @@ class MetaPromptOptimizer(BaseOptimizer):
     def _create_result(
         self,
         metric: Callable,
-        initial_prompt: List[Dict[str, str]],
-        best_prompt: List[Dict[str, str]],
+        initial_prompt: list[dict[str, str]],
+        best_prompt: list[dict[str, str]],
         best_score: float,
         initial_score: float,
-        rounds: List[OptimizationRound],
-        dataset_id: Optional[str],
-        optimization_id: Optional[str],
+        rounds: list[OptimizationRound],
+        dataset_id: str | None,
+        optimization_id: str | None,
+        best_tools: list[dict[str, Any]] | None,
     ) -> OptimizationResult:
         """Create the final OptimizationResult object."""
         details = {
@@ -670,6 +871,18 @@ class MetaPromptOptimizer(BaseOptimizer):
             "temperature": self.model_kwargs.get("temperature"),
         }
 
+        if best_tools:
+            details["final_tools"] = best_tools
+
+        tool_prompts = None
+        if best_tools:
+            tool_prompts = {
+                (tool.get("function", {}).get("name") or f"tool_{idx}"): tool.get(
+                    "function", {}
+                ).get("description")
+                for idx, tool in enumerate(best_tools)
+            }
+
         return OptimizationResult(
             optimizer=self.__class__.__name__,
             prompt=best_prompt,
@@ -679,8 +892,10 @@ class MetaPromptOptimizer(BaseOptimizer):
             metric_name=getattr(metric, "__name__", str(metric)),
             details=details,
             llm_calls=self.llm_call_counter,
+            tool_calls=self.tool_call_counter,
             dataset_id=dataset_id,
             optimization_id=optimization_id,
+            tool_prompts=tool_prompts,
         )
 
     def _get_task_context(self, metric: Callable) -> str:
@@ -714,11 +929,11 @@ class MetaPromptOptimizer(BaseOptimizer):
         current_prompt: chat_prompt.ChatPrompt,
         best_score: float,
         round_num: int,
-        previous_rounds: List[OptimizationRound],
+        previous_rounds: list[OptimizationRound],
         metric: Callable,
-        optimization_id: Optional[str] = None,
-        project_name: Optional[str] = None,
-    ) -> List[chat_prompt.ChatPrompt]:
+        optimization_id: str | None = None,
+        project_name: str | None = None,
+    ) -> list[chat_prompt.ChatPrompt]:
         """Generate candidate prompts using meta-prompting."""
         with reporting.display_candidate_generation_report(
             self.num_prompts_per_round, verbose=self.verbose
@@ -819,7 +1034,7 @@ class MetaPromptOptimizer(BaseOptimizer):
                 )
 
                 # Extract and log valid prompts
-                valid_prompts: List[chat_prompt.ChatPrompt] = []
+                valid_prompts: list[chat_prompt.ChatPrompt] = []
                 for item in json_result["prompts"]:
                     if (
                         isinstance(item, dict)
@@ -870,7 +1085,128 @@ class MetaPromptOptimizer(BaseOptimizer):
                 f"Unexpected error during candidate prompt generation: {e}"
             )
 
-    def _build_history_context(self, previous_rounds: List[OptimizationRound]) -> str:
+    def _generate_mcp_candidate_prompts(
+        self,
+        current_prompt: chat_prompt.ChatPrompt,
+        best_score: float,
+        round_num: int,
+        previous_rounds: list[OptimizationRound],
+        metric: Callable,
+        tool_segment_id: str,
+        tool_name: str,
+        optimization_id: str | None = None,
+        project_name: str | None = None,
+        panel_style: str = "bright_magenta",
+    ) -> list[chat_prompt.ChatPrompt]:
+        segments = {
+            segment.segment_id: segment
+            for segment in extract_prompt_segments(current_prompt)
+        }
+        if tool_segment_id not in segments:
+            raise ValueError(f"Tool segment '{tool_segment_id}' not found in prompt")
+
+        target_segment = segments[tool_segment_id]
+        current_description = target_segment.content
+        tool_metadata = target_segment.metadata.get("raw_tool", {})
+
+        history_context = self._build_history_context(previous_rounds)
+
+        instruction = textwrap.dedent(
+            f"""
+            Current tool name: {tool_name}
+            Current tool description:
+            ---
+            {current_description}
+            ---
+
+            Tool metadata (JSON):
+            {json.dumps(tool_metadata, indent=2)}
+
+            Current best score: {best_score:.4f}
+            {history_context}
+
+            Generate {self.num_prompts_per_round} improved descriptions for this tool.
+            Each description should clarify expected input arguments and set explicit expectations
+            for how the tool output must be used in the final response.
+            Avoid changing unrelated parts of the prompt. Focus only on the description text for `{tool_name}`.
+
+            Return a JSON object of the form:
+            {{
+                "prompts": [
+                    {{
+                        "tool_description": "...",
+                        "improvement_focus": "...",
+                        "reasoning": "..."
+                    }}
+                ]
+            }}
+            """
+        ).strip()
+
+        with reporting.display_candidate_generation_report(
+            self.num_prompts_per_round, verbose=self.verbose
+        ) as candidate_generation_report:
+            try:
+                content = self._call_model(
+                    project_name,
+                    messages=[
+                        {"role": "system", "content": self._REASONING_SYSTEM_PROMPT},
+                        {"role": "user", "content": instruction},
+                    ],
+                    is_reasoning=True,
+                    optimization_id=optimization_id,
+                )
+
+                try:
+                    json_result = json.loads(content)
+                except json.JSONDecodeError:
+                    import re
+
+                    json_match = re.search(r"\{.*\}", content, re.DOTALL)
+                    if not json_match:
+                        raise ValueError("No JSON object found in reasoning output")
+                    json_result = json.loads(json_match.group())
+
+                prompts_payload = json_result.get("prompts")
+                if not isinstance(prompts_payload, list):
+                    raise ValueError("Reasoning output missing 'prompts' list")
+
+                candidate_generation_report.set_generated_prompts()
+
+                candidates: list[chat_prompt.ChatPrompt] = []
+                for item in prompts_payload:
+                    if not isinstance(item, dict):
+                        continue
+                    description = item.get("tool_description")
+                    if not isinstance(description, str) or not description.strip():
+                        continue
+
+                    updated_prompt = apply_segment_updates(
+                        current_prompt,
+                        {tool_segment_id: description.strip()},
+                    )
+                    _sync_tool_description_in_system(updated_prompt)
+                    if (
+                        description.strip()
+                        and description.strip() != current_description.strip()
+                    ):
+                        reporting.display_tool_description(
+                            description.strip(),
+                            f"Round {round_num + 1} tool description",
+                            panel_style,
+                        )
+                    candidates.append(updated_prompt)
+
+                if not candidates:
+                    raise ValueError(
+                        "Reasoning output did not produce valid tool descriptions"
+                    )
+
+                return candidates
+            except Exception as exc:
+                raise ValueError(f"Error generating MCP prompt candidates: {exc}")
+
+    def _build_history_context(self, previous_rounds: list[OptimizationRound]) -> str:
         """Build context from previous optimization rounds."""
         if not previous_rounds:
             return ""
@@ -896,7 +1232,7 @@ class MetaPromptOptimizer(BaseOptimizer):
 
     def _get_evaluation_subset(
         self, dataset: opik.Dataset, min_size: int = 20, max_size: int = 100
-    ) -> List[Dict[str, Any]]:
+    ) -> list[dict[str, Any]]:
         """Get a random subset of the dataset for evaluation.
 
         Returns:
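
Taken together, the constructor and signature changes above mean that 2.0.0 callers pass n_threads and seed directly to MetaPromptOptimizer, configure project_name on the ChatPrompt rather than on the optimizer (per the deprecation warnings), and supply a metric that takes dataset_item and llm_output. A minimal usage sketch follows; the ChatPrompt keyword arguments, the dataset name, and the metric's field names are illustrative assumptions rather than values taken from this diff.

import opik

from opik_optimizer.meta_prompt_optimizer.meta_prompt_optimizer import MetaPromptOptimizer
from opik_optimizer.optimization_config.chat_prompt import ChatPrompt

# Prompt definition; the system/user keywords are assumed, and project_name
# (if needed) now lives on the prompt side per the deprecation notice above.
prompt = ChatPrompt(
    system="Answer the question concisely.",
    user="{question}",
)

# 2.0.0-style construction: n_threads replaces the deprecated num_threads,
# and seed is an explicit constructor argument.
optimizer = MetaPromptOptimizer(
    model="openai/gpt-4o-mini",  # any LiteLLM-compatible model name
    rounds=3,
    num_prompts_per_round=4,
    n_threads=12,
    seed=42,
)

dataset = opik.Opik().get_or_create_dataset(name="my-qa-dataset")  # hypothetical dataset name

def exact_match(dataset_item, llm_output):
    # Metric contract per the validation message in the diff:
    # a callable taking dataset_item and llm_output.
    return float(dataset_item["answer"].strip() == llm_output.strip())  # assumed field name

result = optimizer.optimize_prompt(
    prompt=prompt,
    dataset=dataset,
    metric=exact_match,
    n_samples=50,
)
print(result)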