opik-optimizer 2.1.2__py3-none-any.whl → 2.2.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to its public registry. It is provided for informational purposes only.
Files changed (39)
  1. opik_optimizer/__init__.py +2 -2
  2. opik_optimizer/base_optimizer.py +314 -145
  3. opik_optimizer/evolutionary_optimizer/crossover_ops.py +31 -4
  4. opik_optimizer/evolutionary_optimizer/evaluation_ops.py +23 -3
  5. opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +122 -95
  6. opik_optimizer/evolutionary_optimizer/mcp.py +11 -6
  7. opik_optimizer/evolutionary_optimizer/mutation_ops.py +25 -5
  8. opik_optimizer/evolutionary_optimizer/population_ops.py +26 -10
  9. opik_optimizer/evolutionary_optimizer/reporting.py +5 -5
  10. opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +53 -99
  11. opik_optimizer/few_shot_bayesian_optimizer/reporting.py +4 -4
  12. opik_optimizer/gepa_optimizer/gepa_optimizer.py +183 -172
  13. opik_optimizer/gepa_optimizer/reporting.py +164 -22
  14. opik_optimizer/hierarchical_reflective_optimizer/hierarchical_reflective_optimizer.py +221 -245
  15. opik_optimizer/hierarchical_reflective_optimizer/hierarchical_root_cause_analyzer.py +38 -14
  16. opik_optimizer/hierarchical_reflective_optimizer/prompts.py +7 -1
  17. opik_optimizer/hierarchical_reflective_optimizer/reporting.py +287 -132
  18. opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +185 -205
  19. opik_optimizer/meta_prompt_optimizer/reporting.py +4 -4
  20. opik_optimizer/mipro_optimizer/__init__.py +2 -2
  21. opik_optimizer/mipro_optimizer/_lm.py +4 -4
  22. opik_optimizer/mipro_optimizer/{_mipro_optimizer_v2.py → mipro_optimizer_v2.py} +1 -7
  23. opik_optimizer/mipro_optimizer/utils.py +1 -0
  24. opik_optimizer/multi_metric_objective.py +33 -0
  25. opik_optimizer/optimizable_agent.py +7 -4
  26. opik_optimizer/optimization_config/chat_prompt.py +7 -10
  27. opik_optimizer/parameter_optimizer/parameter_optimizer.py +188 -40
  28. opik_optimizer/parameter_optimizer/reporting.py +148 -0
  29. opik_optimizer/reporting_utils.py +42 -15
  30. opik_optimizer/task_evaluator.py +26 -9
  31. opik_optimizer/utils/core.py +16 -2
  32. opik_optimizer/utils/prompt_segments.py +1 -2
  33. {opik_optimizer-2.1.2.dist-info → opik_optimizer-2.2.0.dist-info}/METADATA +2 -3
  34. {opik_optimizer-2.1.2.dist-info → opik_optimizer-2.2.0.dist-info}/RECORD +37 -37
  35. opik_optimizer/evolutionary_optimizer/llm_support.py +0 -136
  36. opik_optimizer/mipro_optimizer/mipro_optimizer.py +0 -680
  37. {opik_optimizer-2.1.2.dist-info → opik_optimizer-2.2.0.dist-info}/WHEEL +0 -0
  38. {opik_optimizer-2.1.2.dist-info → opik_optimizer-2.2.0.dist-info}/licenses/LICENSE +0 -0
  39. {opik_optimizer-2.1.2.dist-info → opik_optimizer-2.2.0.dist-info}/top_level.txt +0 -0
opik_optimizer/__init__.py

@@ -15,10 +15,10 @@ from .few_shot_bayesian_optimizer import FewShotBayesianOptimizer
  from .gepa_optimizer import GepaOptimizer
  from .logging_config import setup_logging
  from .meta_prompt_optimizer import MetaPromptOptimizer
- from .mipro_optimizer import MiproOptimizer
  from .hierarchical_reflective_optimizer import HierarchicalReflectiveOptimizer
  from .optimization_config.configs import TaskConfig
  from .optimization_result import OptimizationResult
+ from .multi_metric_objective import MultiMetricObjective
  from .parameter_optimizer import (
      ParameterOptimizer,
      ParameterSearchSpace,
@@ -39,7 +39,6 @@ __all__ = [
      "FewShotBayesianOptimizer",
      "GepaOptimizer",
      "MetaPromptOptimizer",
-     "MiproOptimizer",
      "EvolutionaryOptimizer",
      "HierarchicalReflectiveOptimizer",
      "ParameterOptimizer",
@@ -48,6 +47,7 @@ __all__ = [
      "setup_logging",
      "datasets",
      "TaskConfig",
+     "MultiMetricObjective",
      "ParameterSearchSpace",
      "ParameterSpec",
      "ParameterType",

opik_optimizer/base_optimizer.py

@@ -13,14 +13,14 @@ import importlib.metadata
  import litellm
  from opik.rest_api.core import ApiError
  from opik.api_objects import optimization
- from opik import Dataset
+ from opik import Dataset, opik_context
  from pydantic import BaseModel

  from . import _throttle, optimization_result
  from .cache_config import initialize_cache
  from .optimization_config import chat_prompt, mappers
  from .optimizable_agent import OptimizableAgent
- from .utils import create_litellm_agent_class, optimization_context
+ from .utils import create_litellm_agent_class
  from . import task_evaluator

  _limiter = _throttle.get_rate_limiter_for_current_opik_installation()
@@ -56,20 +56,24 @@ class BaseOptimizer(ABC):
          model: str,
          verbose: int = 1,
          seed: int = 42,
-         **model_kwargs: Any,
+         model_parameters: dict[str, Any] | None = None,
      ) -> None:
          """
          Base class for optimizers.

          Args:
-             model: LiteLLM model name
-             verbose: Controls internal logging/progress bars (0=off, 1=on).
-             seed: Random seed for reproducibility (default: 42)
-             model_kwargs: additional args for model (eg, temperature)
+             model: LiteLLM model name for optimizer's internal reasoning/generation calls
+             verbose: Controls internal logging/progress bars (0=off, 1=on)
+             seed: Random seed for reproducibility
+             model_parameters: Optional dict of LiteLLM parameters for optimizer's internal LLM calls.
+                 Common params: temperature, max_tokens, max_completion_tokens, top_p,
+                 presence_penalty, frequency_penalty.
+                 See: https://docs.litellm.ai/docs/completion/input
+                 Note: These params control the optimizer's reasoning model, NOT the prompt evaluation.
          """
          self.model = model
          self.reasoning_model = model
-         self.model_kwargs = model_kwargs
+         self.model_parameters = model_parameters or {}
          self.verbose = verbose
          self.seed = seed
          self._history: list[OptimizationRound] = []
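
For callers, this constructor change means LiteLLM options are no longer passed as loose keyword arguments but grouped into an explicit model_parameters dict that only affects the optimizer's internal reasoning calls. A migration sketch, assuming a concrete optimizer such as MetaPromptOptimizer forwards these constructor arguments to BaseOptimizer unchanged:

    from opik_optimizer import MetaPromptOptimizer

    # 2.1.x: extra keyword arguments were collected into **model_kwargs
    # optimizer = MetaPromptOptimizer(model="openai/gpt-4o", temperature=0.2, max_tokens=512)

    # 2.2.0: the same options are passed explicitly as a dict
    optimizer = MetaPromptOptimizer(
        model="openai/gpt-4o",
        model_parameters={"temperature": 0.2, "max_tokens": 512},
        seed=42,
    )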
@@ -77,20 +81,22 @@ class BaseOptimizer(ABC):
          self.llm_call_counter = 0
          self.tool_call_counter = 0
          self._opik_client = None  # Lazy initialization
+         self.current_optimization_id: str | None = None  # Track current optimization
+         self.project_name: str = "Optimization"  # Default project name

          # Initialize shared cache
          initialize_cache()

-     def reset_counters(self) -> None:
+     def _reset_counters(self) -> None:
          """Reset all call counters for a new optimization run."""
          self.llm_call_counter = 0
          self.tool_call_counter = 0

-     def increment_llm_counter(self) -> None:
+     def _increment_llm_counter(self) -> None:
          """Increment the LLM call counter."""
          self.llm_call_counter += 1

-     def increment_tool_counter(self) -> None:
+     def _increment_tool_counter(self) -> None:
          """Increment the tool call counter."""
          self.tool_call_counter += 1
@@ -100,7 +106,7 @@ class BaseOptimizer(ABC):
          Should be called when the optimizer is no longer needed.
          """
          # Reset counters
-         self.reset_counters()
+         self._reset_counters()

          # Clear history to free memory
          self._history.clear()
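
The counter and cleanup helpers lose their public names in these two hunks, so subclasses or external code that called reset_counters(), increment_llm_counter(), or increment_tool_counter() must switch to the underscore-prefixed versions. A rough, illustrative-only subclass sketch showing the 2.2.0 names:

    from typing import Any

    from opik_optimizer.base_optimizer import BaseOptimizer


    class MySketchOptimizer(BaseOptimizer):
        """Illustrative only: demonstrates the renamed private helpers, not a real optimizer."""

        def optimize_prompt(self, *args: Any, **kwargs: Any) -> Any:
            self._reset_counters()          # was self.reset_counters() in 2.1.x
            # ... generate and evaluate candidate prompts ...
            self._increment_llm_counter()   # was self.increment_llm_counter()
            self._increment_tool_counter()  # was self.increment_tool_counter()
            raise NotImplementedError("sketch only")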
@@ -129,7 +135,7 @@ class BaseOptimizer(ABC):
              self._opik_client = opik.Opik()
          return self._opik_client

-     def validate_optimization_inputs(
+     def _validate_optimization_inputs(
          self, prompt: "chat_prompt.ChatPrompt", dataset: "Dataset", metric: Callable
      ) -> None:
          """
@@ -154,7 +160,7 @@ class BaseOptimizer(ABC):
                  "Metric must be a function that takes `dataset_item` and `llm_output` as arguments."
              )

-     def setup_agent_class(
+     def _setup_agent_class(
          self, prompt: "chat_prompt.ChatPrompt", agent_class: Any = None
      ) -> Any:
          """
@@ -172,19 +178,288 @@ class BaseOptimizer(ABC):
          else:
              return agent_class

-     def configure_prompt_model(self, prompt: "chat_prompt.ChatPrompt") -> None:
+     def _extract_tool_prompts(
+         self, tools: list[dict[str, Any]] | None
+     ) -> dict[str, str] | None:
          """
-         Configure prompt model and model_kwargs if not set.
+         Extract tool names and descriptions from tools list.

          Args:
-             prompt: The chat prompt to configure
+             tools: List of tool definitions in OpenAI/LiteLLM format
+
+         Returns:
+             Dictionary mapping tool names to descriptions, or None if no tools
          """
-         # Only configure if prompt is a valid ChatPrompt object
-         if hasattr(prompt, "model") and hasattr(prompt, "model_kwargs"):
-             if prompt.model is None:
-                 prompt.model = self.model
-             if prompt.model_kwargs is None:
-                 prompt.model_kwargs = self.model_kwargs
+         if not tools:
+             return None
+
+         return {
+             (tool.get("function", {}).get("name") or f"tool_{idx}"): tool.get(
+                 "function", {}
+             ).get("description", "")
+             for idx, tool in enumerate(tools)
+         }
+
+     # ------------------------------------------------------------------
+     # LLM call methods
+     # ------------------------------------------------------------------
+
+     def _prepare_model_params(
+         self,
+         call_time_params: dict[str, Any],
+         response_model: type[BaseModel] | None = None,
+         is_reasoning: bool = False,
+     ) -> dict[str, Any]:
+         """
+         Prepare parameters for LiteLLM call by merging and adding monitoring.
+
+         Args:
+             call_time_params: Dict of LiteLLM params from call-time overrides
+             response_model: Optional Pydantic model for structured output
+             is_reasoning: Flag for metadata tagging
+
+         Returns:
+             Dictionary ready for litellm.completion/acompletion
+         """
+         from opik.evaluation.models.litellm import opik_monitor as opik_litellm_monitor
+
+         # Merge optimizer's model_parameters with call-time overrides
+         merged_params = {**self.model_parameters, **call_time_params}
+
+         # Add Opik monitoring wrapper
+         final_params = opik_litellm_monitor.try_add_opik_monitoring_to_params(
+             merged_params
+         )
+
+         # Add reasoning metadata if applicable
+         if is_reasoning and "metadata" in final_params:
+             if "opik_call_type" not in final_params["metadata"]:
+                 final_params["metadata"]["opik_call_type"] = "reasoning"
+
+         # Configure project_name and tags for Opik tracing
+         if "metadata" not in final_params:
+             final_params["metadata"] = {}
+         if "opik" not in final_params["metadata"]:
+             final_params["metadata"]["opik"] = {}
+
+         # Set project name for optimizer reasoning calls
+         final_params["metadata"]["opik"]["project_name"] = self.project_name
+
+         # Add tags if optimization_id is available
+         if self.current_optimization_id:
+             final_params["metadata"]["opik"]["tags"] = [
+                 self.current_optimization_id,
+                 "Prompt Optimization",
+             ]
+
+         # Add structured output support
+         if response_model is not None:
+             final_params["response_format"] = response_model
+
+         return final_params
+
+     def _parse_response(
+         self,
+         response: Any,
+         response_model: type[BaseModel] | None = None,
+     ) -> BaseModel | str:
+         """
+         Parse LiteLLM response, with optional structured output parsing.
+
+         Args:
+             response: The response from litellm.completion/acompletion
+             response_model: Optional Pydantic model for structured output
+
+         Returns:
+             If response_model is provided, returns an instance of that model.
+             Otherwise, returns the raw string response.
+         """
+         content = response.choices[0].message.content
+
+         # When using structured outputs with Pydantic models, LiteLLM automatically
+         # parses the response. Parse the JSON string into the Pydantic model
+         if response_model is not None:
+             return response_model.model_validate_json(content)
+
+         return content
+
+     def _build_call_time_params(
+         self,
+         temperature: float | None = None,
+         max_tokens: int | None = None,
+         max_completion_tokens: int | None = None,
+         top_p: float | None = None,
+         presence_penalty: float | None = None,
+         frequency_penalty: float | None = None,
+         metadata: dict[str, Any] | None = None,
+     ) -> dict[str, Any]:
+         """
+         Build dictionary of call-time LiteLLM parameter overrides.
+
+         Args:
+             temperature: Sampling temperature (0-2)
+             max_tokens: Maximum tokens to generate
+             max_completion_tokens: Upper bound for generated tokens
+             top_p: Nucleus sampling probability mass
+             presence_penalty: Penalty for new tokens based on presence
+             frequency_penalty: Penalty for new tokens based on frequency
+             metadata: Optional metadata dict for monitoring
+
+         Returns:
+             Dictionary of non-None parameters for LiteLLM
+         """
+         call_time_params: dict[str, Any] = {}
+         if temperature is not None:
+             call_time_params["temperature"] = temperature
+         if max_tokens is not None:
+             call_time_params["max_tokens"] = max_tokens
+         if max_completion_tokens is not None:
+             call_time_params["max_completion_tokens"] = max_completion_tokens
+         if top_p is not None:
+             call_time_params["top_p"] = top_p
+         if presence_penalty is not None:
+             call_time_params["presence_penalty"] = presence_penalty
+         if frequency_penalty is not None:
+             call_time_params["frequency_penalty"] = frequency_penalty
+         if metadata is not None:
+             call_time_params["metadata"] = metadata
+         return call_time_params
+
+     @_throttle.rate_limited(_limiter)
+     def _call_model(
+         self,
+         messages: list[dict[str, str]],
+         model: str | None = None,
+         seed: int | None = None,
+         response_model: type[BaseModel] | None = None,
+         is_reasoning: bool = False,
+         # Explicit call-time overrides for LiteLLM params
+         temperature: float | None = None,
+         max_tokens: int | None = None,
+         max_completion_tokens: int | None = None,
+         top_p: float | None = None,
+         presence_penalty: float | None = None,
+         frequency_penalty: float | None = None,
+         # Optimizer-specific metadata (not passed to LiteLLM)
+         optimization_id: str | None = None,
+         metadata: dict[str, Any] | None = None,
+     ) -> BaseModel | str:
+         """
+         Call the LLM model with optional structured output.
+
+         Args:
+             messages: List of message dictionaries with 'role' and 'content' keys
+             model: The model to use (defaults to self.model)
+             seed: Random seed for reproducibility (defaults to self.seed)
+             response_model: Optional Pydantic model for structured output
+             is_reasoning: Flag for metadata tagging (not passed to LiteLLM)
+             temperature: Sampling temperature (0-2)
+             max_tokens: Maximum tokens to generate
+             max_completion_tokens: Upper bound for generated tokens
+             top_p: Nucleus sampling probability mass
+             presence_penalty: Penalty for new tokens based on presence
+             frequency_penalty: Penalty for new tokens based on frequency
+             optimization_id: Optional ID for optimization tracking (metadata only)
+             metadata: Optional metadata dict for monitoring
+
+         Returns:
+             If response_model is provided, returns an instance of that model.
+             Otherwise, returns the raw string response.
+         """
+         self._increment_llm_counter()
+
+         # Build dict of call-time LiteLLM parameter overrides (non-None only)
+         call_time_params = self._build_call_time_params(
+             temperature=temperature,
+             max_tokens=max_tokens,
+             max_completion_tokens=max_completion_tokens,
+             top_p=top_p,
+             presence_penalty=presence_penalty,
+             frequency_penalty=frequency_penalty,
+             metadata=metadata,
+         )
+
+         final_params_for_litellm = self._prepare_model_params(
+             call_time_params, response_model, is_reasoning
+         )
+
+         response = litellm.completion(
+             model=model or self.model,
+             messages=messages,
+             seed=seed if seed is not None else self.seed,
+             num_retries=6,
+             **final_params_for_litellm,
+         )
+
+         return self._parse_response(response, response_model)
+
+     @_throttle.rate_limited(_limiter)
+     async def _call_model_async(
+         self,
+         messages: list[dict[str, str]],
+         model: str | None = None,
+         seed: int | None = None,
+         response_model: type[BaseModel] | None = None,
+         is_reasoning: bool = False,
+         # Explicit call-time overrides for LiteLLM params
+         temperature: float | None = None,
+         max_tokens: int | None = None,
+         max_completion_tokens: int | None = None,
+         top_p: float | None = None,
+         presence_penalty: float | None = None,
+         frequency_penalty: float | None = None,
+         # Optimizer-specific metadata (not passed to LiteLLM)
+         optimization_id: str | None = None,
+         metadata: dict[str, Any] | None = None,
+     ) -> BaseModel | str:
+         """
+         Async version of _call_model using litellm.acompletion.
+
+         Args:
+             messages: List of message dictionaries with 'role' and 'content' keys
+             model: The model to use (defaults to self.model)
+             seed: Random seed for reproducibility (defaults to self.seed)
+             response_model: Optional Pydantic model for structured output
+             is_reasoning: Flag for metadata tagging (not passed to LiteLLM)
+             temperature: Sampling temperature (0-2)
+             max_tokens: Maximum tokens to generate
+             max_completion_tokens: Upper bound for generated tokens
+             top_p: Nucleus sampling probability mass
+             presence_penalty: Penalty for new tokens based on presence
+             frequency_penalty: Penalty for new tokens based on frequency
+             optimization_id: Optional ID for optimization tracking (metadata only)
+             metadata: Optional metadata dict for monitoring
+
+         Returns:
+             If response_model is provided, returns an instance of that model.
+             Otherwise, returns the raw string response.
+         """
+         self._increment_llm_counter()
+
+         # Build dict of call-time LiteLLM parameter overrides (non-None only)
+         call_time_params = self._build_call_time_params(
+             temperature=temperature,
+             max_tokens=max_tokens,
+             max_completion_tokens=max_completion_tokens,
+             top_p=top_p,
+             presence_penalty=presence_penalty,
+             frequency_penalty=frequency_penalty,
+             metadata=metadata,
+         )
+
+         final_params_for_litellm = self._prepare_model_params(
+             call_time_params, response_model, is_reasoning
+         )
+
+         response = await litellm.acompletion(
+             model=model or self.model,
+             messages=messages,
+             seed=seed if seed is not None else self.seed,
+             num_retries=6,
+             **final_params_for_litellm,
+         )
+
+         return self._parse_response(response, response_model)

      # ------------------------------------------------------------------
      # Experiment metadata helpers
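
Taken together, the new helpers above give every optimizer a single entry point for its reasoning-model calls: call-time overrides are merged over model_parameters, Opik monitoring metadata (project name, optimization tags) is attached, and a Pydantic response_model makes _parse_response return a parsed object instead of a raw string. A sketch of how a subclass method might use _call_model; the PromptCritique schema and message text are invented for illustration:

    from pydantic import BaseModel


    class PromptCritique(BaseModel):
        """Illustrative structured-output schema; not part of the package."""
        score: float
        issues: list[str]


    def _critique_prompt(self, prompt_text: str) -> PromptCritique:
        # Sketch of a helper method on a BaseOptimizer subclass using the new API.
        return self._call_model(
            messages=[
                {"role": "system", "content": "You critique prompts."},
                {"role": "user", "content": f"Critique this prompt: {prompt_text}"},
            ],
            response_model=PromptCritique,  # parsed via model_validate_json in _parse_response
            is_reasoning=True,              # tagged as optimizer reasoning in Opik metadata
            temperature=0.3,                # call-time override merged over self.model_parameters
        )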
@@ -292,7 +567,7 @@ class BaseOptimizer(ABC):
              "name": self.__class__.__name__,
              "version": _OPTIMIZER_VERSION,
              "model": self.model,
-             "model_kwargs": self.model_kwargs or None,
+             "model_parameters": self.model_parameters or None,
              "seed": getattr(self, "seed", None),
              "num_threads": getattr(self, "num_threads", None),
          }
@@ -362,39 +637,10 @@ class BaseOptimizer(ABC):
              base_config = self._deep_merge_dicts(base_config, additional_metadata)

          if experiment_config:
-             base_config = self._deep_merge_dicts(base_config, experiment_config)
+             base_config = self._deep_merge_dicts(experiment_config, base_config)

          return self._drop_none(base_config)

-     def create_optimization_context(
-         self, dataset: "Dataset", metric: Callable, metadata: dict | None = None
-     ) -> Any:
-         """
-         Create optimization context for tracking.
-
-         Args:
-             dataset: The dataset being optimized
-             metric: The metric function
-             metadata: Additional metadata
-
-         Returns:
-             Optimization context manager
-         """
-         context_metadata = {
-             "optimizer": self.__class__.__name__,
-             "model": self.model,
-             "seed": self.seed,
-         }
-         if metadata:
-             context_metadata.update(metadata)
-
-         return optimization_context(
-             client=self.opik_client,
-             dataset_name=dataset.name,
-             objective_name=metric.__name__,
-             metadata=context_metadata,
-         )
-
      @abstractmethod
      def optimize_prompt(
          self,
@@ -405,6 +651,8 @@ class BaseOptimizer(ABC):
          n_samples: int | None = None,
          auto_continue: bool = False,
          agent_class: type[OptimizableAgent] | None = None,
+         project_name: str = "Optimization",
+         *args: Any,
          **kwargs: Any,
      ) -> optimization_result.OptimizationResult:
          """
@@ -418,93 +666,11 @@ class BaseOptimizer(ABC):
              input_key: input field of dataset
              output_key: output field of dataset
              experiment_config: Optional configuration for the experiment
+             project_name: Opik project name for logging traces (default: "Optimization")
              **kwargs: Additional arguments for optimization
          """
          pass

-     def optimize_mcp(
-         self,
-         prompt: "chat_prompt.ChatPrompt",
-         dataset: Dataset,
-         metric: Callable,
-         *,
-         tool_name: str,
-         second_pass: Any,
-         experiment_config: dict | None = None,
-         n_samples: int | None = None,
-         auto_continue: bool = False,
-         agent_class: type[OptimizableAgent] | None = None,
-         fallback_invoker: Callable[[dict[str, Any]], str] | None = None,
-         fallback_arguments: Callable[[Any], dict[str, Any]] | None = None,
-         allow_tool_use_on_second_pass: bool = False,
-         **kwargs: Any,
-     ) -> optimization_result.OptimizationResult:
-         """
-         Optimize prompts that rely on MCP (Model Context Protocol) tooling.
-
-         This method provides a standardized interface for optimizing prompts that use
-         external tools through the MCP protocol. It handles tool invocation, second-pass
-         coordination, and fallback mechanisms.
-
-         Args:
-             prompt: The chat prompt to optimize, must include tools
-             dataset: Opik dataset containing evaluation data
-             metric: Evaluation function that takes (dataset_item, llm_output) and returns a score
-             tool_name: Name of the MCP tool to use for optimization
-             second_pass: MCPSecondPassCoordinator for handling second-pass tool calls
-             experiment_config: Optional configuration for the experiment
-             n_samples: Number of samples to use for optimization (default: None)
-             auto_continue: Whether to auto-continue optimization (default: False)
-             agent_class: Custom agent class to use (default: None)
-             fallback_invoker: Fallback function for tool invocation (default: None)
-             fallback_arguments: Function to extract tool arguments (default: None)
-             allow_tool_use_on_second_pass: Whether to allow tool use on second pass (default: False)
-             **kwargs: Additional arguments for optimization
-
-         Returns:
-             OptimizationResult: The optimization result containing the optimized prompt and metrics
-
-         Raises:
-             NotImplementedError: If the optimizer doesn't implement MCP optimization
-             ValueError: If the prompt doesn't include required tools
-         """
-         raise NotImplementedError(
-             f"{self.__class__.__name__} does not implement optimize_mcp yet."
-         )
-
-     def optimize_parameter(
-         self,
-         prompt: "chat_prompt.ChatPrompt",
-         dataset: Dataset,
-         metric: Callable,
-         parameter_space: Any,
-         experiment_config: dict | None = None,
-         n_trials: int | None = None,
-         n_samples: int | None = None,
-         agent_class: type[OptimizableAgent] | None = None,
-         **kwargs: Any,
-     ) -> optimization_result.OptimizationResult:
-         """
-         Optimize LLM call parameters such as temperature or top_k.
-
-         Args:
-             prompt: The chat prompt to evaluate with tuned parameters
-             dataset: Dataset providing evaluation examples
-             metric: Objective function to maximize
-             parameter_space: Definition of the search space for tunable parameters
-             experiment_config: Optional experiment metadata
-             n_trials: Number of trials to run (optimizer specific default if None)
-             n_samples: Number of dataset samples to evaluate per trial (None for all)
-             agent_class: Optional custom agent class to execute evaluations
-             **kwargs: Additional optimizer specific settings
-
-         Returns:
-             OptimizationResult: Structured result describing the best parameters found
-         """
-         raise NotImplementedError(
-             f"{self.__class__.__name__} does not implement optimize_parameter yet."
-         )
-
      def get_history(self) -> list[OptimizationRound]:
          """
          Get the optimization history.
@@ -523,7 +689,7 @@ class BaseOptimizer(ABC):
          """
          self._history.append(round_data)

-     def update_optimization(
+     def _update_optimization(
          self, optimization: optimization.Optimization, status: str
      ) -> None:
          """
@@ -556,11 +722,6 @@ class BaseOptimizer(ABC):
      ) -> float:
          random.seed(seed)

-         if prompt.model is None:
-             prompt.model = self.model
-         if prompt.model_kwargs is None:
-             prompt.model_kwargs = self.model_kwargs
-
          self.agent_class: type[OptimizableAgent]

          if agent_class is None:
@@ -574,6 +735,13 @@ class BaseOptimizer(ABC):
              messages = prompt.get_messages(dataset_item)
              raw_model_output = agent.invoke(messages)
              cleaned_model_output = raw_model_output.strip()
+
+             # Add tags to trace for optimization tracking
+             if self.current_optimization_id:
+                 opik_context.update_current_trace(
+                     tags=[self.current_optimization_id, "Evaluation"]
+                 )
+
              result = {
                  mappers.EVALUATED_LLM_TASK_OUTPUT: cleaned_model_output,
              }
@@ -591,6 +759,7 @@ class BaseOptimizer(ABC):
                  raise Exception("Can't use n_samples and dataset_item_ids")

              all_ids = [dataset_item["id"] for dataset_item in dataset.get_items()]
+             n_samples = min(n_samples, len(all_ids))
              dataset_item_ids = random.sample(all_ids, n_samples)

          score = task_evaluator.evaluate(
@@ -599,9 +768,9 @@ class BaseOptimizer(ABC):
              metric=metric,
              evaluated_task=llm_task,
              num_threads=n_threads,
-             project_name=experiment_config.get("project_name"),
+             project_name=self.project_name,
              experiment_config=experiment_config,
-             optimization_id=None,
+             optimization_id=self.current_optimization_id,
              verbose=verbose,
          )
          return score
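
That closes out the base_optimizer.py hunks. From a user's point of view, the most visible additions are the project_name argument on optimize_prompt and the automatic tagging of evaluation traces with the current optimization id. A hedged usage sketch, assuming the concrete optimizers mirror the BaseOptimizer signature shown above (prompt, dataset, and metric construction follow the package's existing API and are abbreviated here):

    from opik_optimizer import MetaPromptOptimizer

    optimizer = MetaPromptOptimizer(model="openai/gpt-4o")

    result = optimizer.optimize_prompt(
        prompt=my_prompt,      # a chat_prompt.ChatPrompt built elsewhere
        dataset=my_dataset,    # an Opik Dataset
        metric=my_metric,      # callable taking (dataset_item, llm_output) and returning a score
        n_samples=50,
        project_name="prompt-tuning-demo",  # new in 2.2.0; traces are logged to this Opik project
    )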

opik_optimizer/evolutionary_optimizer/crossover_ops.py

@@ -89,7 +89,20 @@ class CrossoverOps:
          else:
              pass

-         return creator.Individual(messages_1_orig), creator.Individual(messages_2_orig)
+         child1 = creator.Individual(messages_1_orig)
+         child2 = creator.Individual(messages_2_orig)
+
+         # Preserve tools and function_map from parents
+         if hasattr(ind1, "tools"):
+             setattr(child1, "tools", getattr(ind1, "tools"))
+         if hasattr(ind1, "function_map"):
+             setattr(child1, "function_map", getattr(ind1, "function_map"))
+         if hasattr(ind2, "tools"):
+             setattr(child2, "tools", getattr(ind2, "tools"))
+         if hasattr(ind2, "function_map"):
+             setattr(child2, "function_map", getattr(ind2, "function_map"))
+
+         return child1, child2

      def _llm_deap_crossover(self, ind1: Any, ind2: Any) -> tuple[Any, Any]:
          """Perform crossover by asking an LLM to blend two parent prompts."""
@@ -151,9 +164,23 @@ class CrossoverOps:
                  raise ValueError("LLM response did not include any valid child prompts")

              # We only need two children; if only one returned, duplicate pattern from DEAP
-             first_child = children[0]
-             second_child = children[1] if len(children) > 1 else children[0]
-             return creator.Individual(first_child), creator.Individual(second_child)
+             first_child_messages = children[0]
+             second_child_messages = children[1] if len(children) > 1 else children[0]
+
+             child1 = creator.Individual(first_child_messages)
+             child2 = creator.Individual(second_child_messages)
+
+             # Preserve tools and function_map from parents
+             if hasattr(ind1, "tools"):
+                 setattr(child1, "tools", getattr(ind1, "tools"))
+             if hasattr(ind1, "function_map"):
+                 setattr(child1, "function_map", getattr(ind1, "function_map"))
+             if hasattr(ind2, "tools"):
+                 setattr(child2, "tools", getattr(ind2, "tools"))
+             if hasattr(ind2, "function_map"):
+                 setattr(child2, "function_map", getattr(ind2, "function_map"))
+
+             return child1, child2
          except Exception as e:
              logger.warning(
                  f"LLM-driven crossover failed: {e}. Falling back to DEAP crossover."