opik-optimizer 1.1.0__py3-none-any.whl → 2.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (27)
  1. opik_optimizer/__init__.py +2 -0
  2. opik_optimizer/base_optimizer.py +376 -19
  3. opik_optimizer/evolutionary_optimizer/evaluation_ops.py +80 -17
  4. opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +179 -39
  5. opik_optimizer/evolutionary_optimizer/llm_support.py +3 -1
  6. opik_optimizer/evolutionary_optimizer/mcp.py +249 -0
  7. opik_optimizer/evolutionary_optimizer/mutation_ops.py +17 -3
  8. opik_optimizer/evolutionary_optimizer/population_ops.py +5 -0
  9. opik_optimizer/evolutionary_optimizer/prompts.py +47 -0
  10. opik_optimizer/evolutionary_optimizer/reporting.py +12 -0
  11. opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +65 -59
  12. opik_optimizer/gepa_optimizer/adapter.py +5 -3
  13. opik_optimizer/gepa_optimizer/gepa_optimizer.py +163 -66
  14. opik_optimizer/mcp_utils/mcp_workflow.py +57 -3
  15. opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +75 -69
  16. opik_optimizer/mipro_optimizer/_lm.py +10 -3
  17. opik_optimizer/mipro_optimizer/_mipro_optimizer_v2.py +1 -1
  18. opik_optimizer/mipro_optimizer/mipro_optimizer.py +96 -21
  19. opik_optimizer/optimizable_agent.py +5 -0
  20. opik_optimizer/optimization_result.py +1 -0
  21. opik_optimizer/utils/core.py +56 -14
  22. {opik_optimizer-1.1.0.dist-info → opik_optimizer-2.0.1.dist-info}/METADATA +97 -10
  23. {opik_optimizer-1.1.0.dist-info → opik_optimizer-2.0.1.dist-info}/RECORD +27 -26
  24. /opik_optimizer/{colbert.py → utils/colbert.py} +0 -0
  25. {opik_optimizer-1.1.0.dist-info → opik_optimizer-2.0.1.dist-info}/WHEEL +0 -0
  26. {opik_optimizer-1.1.0.dist-info → opik_optimizer-2.0.1.dist-info}/licenses/LICENSE +0 -0
  27. {opik_optimizer-1.1.0.dist-info → opik_optimizer-2.0.1.dist-info}/top_level.txt +0 -0
opik_optimizer/gepa_optimizer/gepa_optimizer.py
@@ -1,5 +1,3 @@
- from __future__ import annotations
-
  import logging
  from contextlib import nullcontext
  from typing import Any, ContextManager
@@ -12,15 +10,18 @@ from opik.evaluation.metrics.score_result import ScoreResult
  from ..base_optimizer import BaseOptimizer
  from ..optimization_config import chat_prompt, mappers
  from ..optimization_result import OptimizationResult
- from ..utils import optimization_context, create_litellm_agent_class
- from ..logging_config import setup_logging as _setup_logging
+ from ..optimizable_agent import OptimizableAgent
+ from ..utils import (
+     optimization_context,
+     create_litellm_agent_class,
+     disable_experiment_reporting,
+     enable_experiment_reporting,
+ )
  from .. import task_evaluator
  from . import reporting as gepa_reporting
  from .adapter import OpikDataInst, OpikGEPAAdapter

-
- _setup_logging()
- LOGGER = logging.getLogger("opik_optimizer.gepa.optimizer")
+ logger = logging.getLogger(__name__)


  class GepaOptimizer(BaseOptimizer):
@@ -32,14 +33,63 @@ class GepaOptimizer(BaseOptimizer):
          project_name: str | None = None,
          reflection_model: str | None = None,
          verbose: int = 1,
+         seed: int = 42,
          **model_kwargs: Any,
      ) -> None:
-         super().__init__(model=model, verbose=verbose, **model_kwargs)
+         # Validate required parameters
+         if model is None:
+             raise ValueError("model parameter is required and cannot be None")
+         if not isinstance(model, str):
+             raise ValueError(f"model must be a string, got {type(model).__name__}")
+         if not model.strip():
+             raise ValueError("model cannot be empty or whitespace-only")
+
+         # Validate optional parameters
+         if project_name is not None and not isinstance(project_name, str):
+             raise ValueError(
+                 f"project_name must be a string or None, got {type(project_name).__name__}"
+             )
+
+         if reflection_model is not None and not isinstance(reflection_model, str):
+             raise ValueError(
+                 f"reflection_model must be a string or None, got {type(reflection_model).__name__}"
+             )
+
+         if not isinstance(verbose, int):
+             raise ValueError(
+                 f"verbose must be an integer, got {type(verbose).__name__}"
+             )
+         if verbose < 0:
+             raise ValueError("verbose must be non-negative")
+
+         if not isinstance(seed, int):
+             raise ValueError(f"seed must be an integer, got {type(seed).__name__}")
+
+         super().__init__(model=model, verbose=verbose, seed=seed, **model_kwargs)
          self.project_name = project_name
          self.reflection_model = reflection_model or model
          self.num_threads = self.model_kwargs.pop("num_threads", 6)
-         self.seed = self.model_kwargs.pop("seed", 42)
          self._gepa_live_metric_calls = 0
+         self._adapter = None  # Will be set during optimization
+
+     def get_optimizer_metadata(self) -> dict[str, Any]:
+         return {
+             "project_name": self.project_name,
+             "reflection_model": self.reflection_model,
+         }
+
+     def cleanup(self) -> None:
+         """
+         Clean up GEPA-specific resources.
+         """
+         # Call parent cleanup
+         super().cleanup()
+
+         # Clear GEPA-specific resources
+         self._adapter = None
+         self._gepa_live_metric_calls = 0
+
+         logger.debug("Cleaned up GEPA-specific resources")

      # ------------------------------------------------------------------
      # Helpers
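Net effect of the new validation block: misconfiguration now fails fast at construction time instead of mid-run, and `seed` is a first-class constructor argument rather than a value popped out of `model_kwargs`. A minimal sketch (assuming `GepaOptimizer` is importable from the package root, which this diff does not show; the model name is illustrative):

```python
from opik_optimizer import GepaOptimizer  # import path assumed

optimizer = GepaOptimizer(
    model="openai/gpt-4o-mini",  # illustrative LiteLLM-style model name
    reflection_model=None,       # falls back to `model` per the constructor body
    verbose=1,
    seed=7,                      # first-class argument in 2.0.1
)

# Eager validation, per the new checks above:
try:
    GepaOptimizer(model="   ")   # whitespace-only model name
except ValueError as exc:
    print(exc)                   # -> model cannot be empty or whitespace-only
```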
@@ -105,21 +155,62 @@ class GepaOptimizer(BaseOptimizer):
      def optimize_prompt(
          self,
          prompt: chat_prompt.ChatPrompt,
-         dataset: str | Dataset,
-         metric: Callable[[dict[str, Any], str], ScoreResult],
-         experiment_config: dict[str, Any] | None = None,
+         dataset: Dataset,
+         metric: Callable,
+         experiment_config: dict | None = None,
+         n_samples: int | None = None,
+         auto_continue: bool = False,
+         agent_class: type[OptimizableAgent] | None = None,
          **kwargs: Any,
      ) -> OptimizationResult:
-         if isinstance(dataset, str):
-             client = opik.Opik(project_name=self.project_name)
-             dataset = client.get_dataset(dataset)
-
-         max_metric_calls: int = int(kwargs.get("max_metric_calls", 30))
+         """
+         Optimize a prompt using the GEPA (Genetic-Pareto) algorithm.
+
+         Args:
+             prompt: The prompt to optimize
+             dataset: Opik Dataset to optimize on
+             metric: Metric function to evaluate on
+             experiment_config: Optional configuration for the experiment
+             n_samples: Optional number of items to test in the dataset
+             auto_continue: Whether to auto-continue optimization
+             agent_class: Optional agent class to use
+             **kwargs: GEPA-specific parameters:
+                 max_metric_calls (int | None): Maximum number of metric evaluations (default: 30)
+                 reflection_minibatch_size (int): Size of reflection minibatches (default: 3)
+                 candidate_selection_strategy (str): Strategy for candidate selection (default: "pareto")
+                 skip_perfect_score (bool): Skip candidates with perfect scores (default: True)
+                 perfect_score (float): Score considered perfect (default: 1.0)
+                 use_merge (bool): Enable merge operations (default: False)
+                 max_merge_invocations (int): Maximum merge invocations (default: 5)
+                 run_dir (str | None): Directory for run outputs (default: None)
+                 track_best_outputs (bool): Track best outputs during optimization (default: False)
+                 display_progress_bar (bool): Display progress bar (default: False)
+                 seed (int): Random seed for reproducibility (default: 42)
+                 raise_on_exception (bool): Raise exceptions instead of continuing (default: True)
+                 mcp_config (MCPExecutionConfig | None): MCP tool-calling configuration (default: None)
+
+         Returns:
+             OptimizationResult: Result of the optimization
+         """
+         # Use base class validation and setup methods
+         self.validate_optimization_inputs(prompt, dataset, metric)
+
+         # Extract GEPA-specific parameters from kwargs
+         max_metric_calls: int | None = kwargs.get("max_metric_calls", 30)
          reflection_minibatch_size: int = int(kwargs.get("reflection_minibatch_size", 3))
          candidate_selection_strategy: str = str(
              kwargs.get("candidate_selection_strategy", "pareto")
          )
-         n_samples: int | None = kwargs.get("n_samples")
+         skip_perfect_score: bool = kwargs.get("skip_perfect_score", True)
+         perfect_score: float = float(kwargs.get("perfect_score", 1.0))
+         use_merge: bool = kwargs.get("use_merge", False)
+         max_merge_invocations: int = int(kwargs.get("max_merge_invocations", 5))
+         run_dir: str | None = kwargs.get("run_dir", None)
+         track_best_outputs: bool = kwargs.get("track_best_outputs", False)
+         display_progress_bar: bool = kwargs.get("display_progress_bar", False)
+         seed: int = int(kwargs.get("seed", 42))
+         raise_on_exception: bool = kwargs.get("raise_on_exception", True)
+         kwargs.pop("mcp_config", None)  # Added for MCP support (for future use)

          prompt = prompt.copy()
          if self.project_name:
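The dataset-name string shortcut is gone (note the removed `isinstance(dataset, str)` branch), so callers fetch the `Dataset` themselves, and `n_samples` is now a named parameter rather than a kwarg. A call sketch reusing the `optimizer` built above; the dataset name, prompt, and metric are illustrative, and the `ChatPrompt` import path is assumed:

```python
import opik
from opik.evaluation.metrics.score_result import ScoreResult
from opik_optimizer import ChatPrompt  # import path assumed


def exact_match(dataset_item: dict, llm_output: str) -> ScoreResult:
    # Toy metric; any callable with this shape works.
    expected = str(dataset_item.get("answer", ""))
    return ScoreResult(name="exact_match", value=float(llm_output.strip() == expected))


dataset = opik.Opik().get_dataset("my-qa-dataset")  # must be a Dataset, not a name
prompt = ChatPrompt(system="Answer concisely.", user="{question}")

result = optimizer.optimize_prompt(
    prompt=prompt,
    dataset=dataset,
    metric=exact_match,
    n_samples=50,                  # named parameter in 2.0.1
    max_metric_calls=60,           # GEPA-specific knobs ride in **kwargs
    reflection_minibatch_size=5,
)
```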
@@ -147,16 +238,19 @@ class GepaOptimizer(BaseOptimizer):

          opik_client = opik.Opik(project_name=self.project_name)

-         with optimization_context(
-             client=opik_client,
-             dataset_name=dataset.name,
-             objective_name=metric.__name__,
-             metadata={"optimizer": self.__class__.__name__},
-         ) as optimization:
-             try:
-                 opt_id = optimization.id if optimization is not None else None
-             except Exception:
-                 opt_id = None
+         disable_experiment_reporting()
+
+         try:
+             with optimization_context(
+                 client=opik_client,
+                 dataset_name=dataset.name,
+                 objective_name=metric.__name__,
+                 metadata={"optimizer": self.__class__.__name__},
+             ) as optimization:
+                 try:
+                     opt_id = optimization.id if optimization is not None else None
+                 except Exception:
+                     opt_id = None

                  gepa_reporting.display_header(
                      algorithm="GEPA",
@@ -210,7 +304,7 @@ class GepaOptimizer(BaseOptimizer):
                      )
                      baseline.set_score(initial_score)
                  except Exception:
-                     LOGGER.exception("Baseline evaluation failed")
+                     logger.exception("Baseline evaluation failed")

                  adapter_prompt = self._apply_system_text(base_prompt, seed_prompt_text)
                  adapter_prompt.project_name = self.project_name
@@ -244,10 +338,17 @@ class GepaOptimizer(BaseOptimizer):
                      "task_lm": None,
                      "reflection_lm": self.reflection_model,
                      "candidate_selection_strategy": candidate_selection_strategy,
+                     "skip_perfect_score": skip_perfect_score,
                      "reflection_minibatch_size": reflection_minibatch_size,
+                     "perfect_score": perfect_score,
+                     "use_merge": use_merge,
+                     "max_merge_invocations": max_merge_invocations,
                      "max_metric_calls": max_metric_calls,
-                     "display_progress_bar": False,
-                     "track_best_outputs": False,
+                     "run_dir": run_dir,
+                     "track_best_outputs": track_best_outputs,
+                     "display_progress_bar": display_progress_bar,
+                     "seed": seed,
+                     "raise_on_exception": raise_on_exception,
                      "logger": gepa_reporting.RichGEPAOptimizerLogger(
                          self, verbose=self.verbose
                      ),
@@ -265,10 +366,13 @@ class GepaOptimizer(BaseOptimizer):
                  with gepa_reporting.start_gepa_optimization(verbose=self.verbose):
                      gepa_result = gepa.optimize(**kwargs_gepa)

-             try:
-                 opt_id = optimization.id if optimization is not None else None
-             except Exception:
-                 opt_id = None
+                 try:
+                     opt_id = optimization.id if optimization is not None else None
+                 except Exception:
+                     opt_id = None
+
+         finally:
+             enable_experiment_reporting()

          # ------------------------------------------------------------------
          # Rescoring & result assembly
@@ -308,7 +412,7 @@ class GepaOptimizer(BaseOptimizer):
              try:
                  score = float(self._evaluate_prompt_logged(**eval_kwargs))
              except Exception:
-                 LOGGER.debug("Rescoring failed for candidate %s", idx, exc_info=True)
+                 logger.debug("Rescoring failed for candidate %s", idx, exc_info=True)
                  score = 0.0

              rescored.append(score)
@@ -382,12 +486,12 @@ class GepaOptimizer(BaseOptimizer):
          try:
              self._evaluate_prompt_logged(**final_eval_kwargs)
          except Exception:
-             LOGGER.debug("Final evaluation failed", exc_info=True)
+             logger.debug("Final evaluation failed", exc_info=True)

          per_item_scores: list[dict[str, Any]] = []
          try:
              analysis_prompt = final_prompt.copy()
-             agent_cls = create_litellm_agent_class(analysis_prompt)
+             agent_cls = create_litellm_agent_class(analysis_prompt, optimizer_ref=self)
              agent = agent_cls(analysis_prompt)
              for item in items:
                  messages = analysis_prompt.get_messages(item)
@@ -408,7 +512,7 @@ class GepaOptimizer(BaseOptimizer):
                      }
                  )
          except Exception:
-             LOGGER.debug("Per-item diagnostics failed", exc_info=True)
+             logger.debug("Per-item diagnostics failed", exc_info=True)

          details: dict[str, Any] = {
              "model": self.model,
@@ -420,13 +524,13 @@ class GepaOptimizer(BaseOptimizer):
              "val_scores": val_scores,
              "opik_rescored_scores": rescored,
              "candidate_summary": candidate_rows,
-             "best_candidate_iteration": candidate_rows[best_idx]["iteration"]
-             if candidate_rows
-             else 0,
+             "best_candidate_iteration": (
+                 candidate_rows[best_idx]["iteration"] if candidate_rows else 0
+             ),
              "selected_candidate_index": best_idx,
-             "selected_candidate_gepa_score": val_scores[best_idx]
-             if best_idx < len(val_scores)
-             else None,
+             "selected_candidate_gepa_score": (
+                 val_scores[best_idx] if best_idx < len(val_scores) else None
+             ),
              "selected_candidate_opik_score": best_score,
              "gepa_live_metric_used": True,
              "gepa_live_metric_call_count": self._gepa_live_metric_calls,
@@ -446,16 +550,16 @@ class GepaOptimizer(BaseOptimizer):
              best_prompt_text, best_score, verbose=self.verbose
          )

-         if LOGGER.isEnabledFor(logging.DEBUG):
+         if logger.isEnabledFor(logging.DEBUG):
              for idx, row in enumerate(candidate_rows):
-                 LOGGER.debug(
+                 logger.debug(
                      "candidate=%s source=%s gepa=%s opik=%s",
                      idx,
                      row.get("source"),
                      row.get("gepa_score"),
                      row.get("opik_score"),
                  )
-             LOGGER.debug(
+             logger.debug(
                  "selected candidate idx=%s gepa=%s opik=%.4f",
                  best_idx,
                  details.get("selected_candidate_gepa_score"),
@@ -516,7 +620,8 @@ class GepaOptimizer(BaseOptimizer):
          if prompt.model_kwargs is None:
              prompt.model_kwargs = self.model_kwargs

-         agent_class = create_litellm_agent_class(prompt)
+         agent_class = create_litellm_agent_class(prompt, optimizer_ref=self)
+         self.agent_class = agent_class
          agent = agent_class(prompt)

          def llm_task(dataset_item: dict[str, Any]) -> dict[str, str]:
@@ -524,22 +629,14 @@ class GepaOptimizer(BaseOptimizer):
              raw = agent.invoke(messages)
              return {mappers.EVALUATED_LLM_TASK_OUTPUT: raw.strip()}

-         experiment_config = experiment_config or {}
-         experiment_config["project_name"] = agent_class.__name__
-         experiment_config = {
-             **experiment_config,
-             **{
-                 "optimizer": self.__class__.__name__,
-                 "agent_class": agent_class.__name__,
-                 "agent_config": prompt.to_dict(),
-                 "metric": metric.__name__,
-                 "dataset": dataset.name,
-                 "configuration": {
-                     "prompt": prompt.get_messages(),
-                     "gepa": (extra_metadata or {}),
-                 },
-             },
-         }
+         configuration_updates = self._drop_none({"gepa": extra_metadata})
+         experiment_config = self._prepare_experiment_config(
+             prompt=prompt,
+             dataset=dataset,
+             metric=metric,
+             experiment_config=experiment_config,
+             configuration_updates=configuration_updates,
+         )

          score = task_evaluator.evaluate(
              dataset=dataset,
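The inline config dict is collapsed into `BaseOptimizer._prepare_experiment_config` (part of the +376-line `base_optimizer.py` change) plus `_drop_none`, which discards None-valued configuration updates. A hypothetical reconstruction of what the helper returns, inferred only from the removed inline dict above and from `experiment_config.get("project_name")` below; the real helper may differ:

```python
from typing import Any


def prepare_experiment_config_sketch(
    agent_class_name: str,
    agent_config: dict[str, Any],
    metric_name: str,
    dataset_name: str,
    prompt_messages: list[dict[str, str]],
    configuration_updates: dict[str, Any],
) -> dict[str, Any]:
    # Field names come from the inline dict removed above; the real helper
    # lives in BaseOptimizer and is not shown in this diff.
    return {
        "project_name": agent_class_name,
        "optimizer": "GepaOptimizer",
        "agent_class": agent_class_name,
        "agent_config": agent_config,
        "metric": metric_name,
        "dataset": dataset_name,
        "configuration": {"prompt": prompt_messages, **configuration_updates},
    }
```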
@@ -547,7 +644,7 @@ class GepaOptimizer(BaseOptimizer):
              metric=metric,
              evaluated_task=llm_task,
              num_threads=self.num_threads,
-             project_name=agent_class.project_name,
+             project_name=experiment_config.get("project_name"),
              experiment_config=experiment_config,
              optimization_id=optimization_id,
              n_samples=n_samples,
opik_optimizer/mcp_utils/mcp_workflow.py
@@ -11,6 +11,7 @@ from __future__ import annotations
  import contextlib
  import copy
  import io
+ import json
  import logging
  import os
  import textwrap
@@ -346,12 +347,19 @@ class MCPToolInvocation:
      preview_label: str | None = None
      preview_chars: int = 160
      rate_limit_sleep: float = DEFAULT_MCP_RATELIMIT_SLEEP
+     cache_enabled: bool = True
      _logger: logging.Logger = field(default_factory=lambda: logger)
+     _cache: dict[str, str] = field(default_factory=dict, init=False)

      def __call__(self, **arguments: Any) -> str:
          return self.invoke(arguments)

-     def invoke(self, arguments: Mapping[str, Any]) -> str:
+     def clear_cache(self) -> None:
+         self._cache.clear()
+
+     def invoke(
+         self, arguments: Mapping[str, Any], *, use_cache: bool | None = None
+     ) -> str:
          def call_tool(name: str, payload: dict[str, Any]) -> Any:
              if self.rate_limit_sleep > 0:
                  time.sleep(self.rate_limit_sleep)
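With `cache_enabled=True` (the new default), repeated invocations with the same prepared arguments return the memoised summary instead of re-calling the MCP server, and `use_cache` overrides the flag per call. A usage sketch; the tool name and arguments are illustrative, and `MCPToolInvocation`'s remaining tool wiring (elided here) plus a live MCP server would be needed to actually run it:

```python
invocation = MCPToolInvocation(tool_name="search_docs")  # cache_enabled=True by default

first = invocation(query="rate limits")   # calls the MCP tool, caches the summary
second = invocation(query="rate limits")  # cache hit: same summary, no tool call

fresh = invocation.invoke({"query": "rate limits"}, use_cache=False)  # bypass once
invocation.clear_cache()                  # evict everything memoised so far
```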
@@ -367,6 +375,19 @@ class MCPToolInvocation:
          if self.argument_adapter:
              prepared = self.argument_adapter(prepared, call_tool)

+         effective_cache = self.cache_enabled if use_cache is None else use_cache
+         cache_key: str | None = None
+         if effective_cache:
+             cache_key = self._make_cache_key(prepared)
+             cached_summary = self._cache.get(cache_key)
+             if cached_summary is not None:
+                 if self.summary_handler:
+                     self.summary_handler.record_summary(cached_summary)
+                 self._logger.debug(
+                     "MCP tool %s cache hit arguments=%s", self.tool_name, prepared
+                 )
+                 return cached_summary
+
          # TODO(opik-mcp): reuse a persistent MCP client so we avoid spawning a
          # new stdio subprocess for each call. This currently mirrors the
          # original blocking behaviour for stability.
@@ -391,11 +412,41 @@ class MCPToolInvocation:
          if self.summary_handler:
              self.summary_handler.record_summary(summary)

+         if effective_cache and cache_key is not None:
+             self._cache[cache_key] = summary
+
          if os.getenv("OPIK_DEBUG_MCP"):
              self._logger.info("MCP %s raw response:\n%s", label, text)

          return summary

+     def _make_cache_key(self, payload: Mapping[str, Any]) -> str:
+         try:
+             return json.dumps(payload, sort_keys=True, default=str)
+         except TypeError:
+             normalised = self._normalise_cache_payload(payload)
+             return json.dumps(normalised, sort_keys=True, default=str)
+
+     @staticmethod
+     def _normalise_cache_payload(value: Any) -> Any:
+         if isinstance(value, Mapping):
+             return {
+                 key: MCPToolInvocation._normalise_cache_payload(val)
+                 for key, val in sorted(value.items(), key=lambda item: str(item[0]))
+             }
+         if isinstance(value, list):
+             return [MCPToolInvocation._normalise_cache_payload(item) for item in value]
+         if isinstance(value, tuple):
+             return [MCPToolInvocation._normalise_cache_payload(item) for item in value]
+         if isinstance(value, set):
+             return [
+                 MCPToolInvocation._normalise_cache_payload(item)
+                 for item in sorted(value, key=repr)
+             ]
+         if isinstance(value, (str, int, float, bool)) or value is None:
+             return value
+         return str(value)


  def summarise_with_template(template: str) -> SummaryBuilder:
      """Return a summary builder that fills the provided template."""
@@ -465,6 +516,7 @@ def preview_second_pass(
      dataset_item: dict[str, Any],
      coordinator: MCPSecondPassCoordinator,
      agent_factory: Callable[[Any], Any],
+     seed: int = 42,
  ) -> None:
      """Debug helper mirroring the old inline scripts."""

@@ -472,7 +524,9 @@ def preview_second_pass(
      agent = agent_factory(prompt)
      base_messages = prompt.get_messages(dataset_item)

-     raw_output = agent.llm_invoke(messages=base_messages, seed=42, allow_tool_use=True)
+     raw_output = agent.llm_invoke(
+         messages=base_messages, seed=seed, allow_tool_use=True
+     )
      logger.debug("Raw model output: %s", raw_output)

      second_pass_messages = coordinator.build_second_pass_messages(
@@ -484,7 +538,7 @@ def preview_second_pass(
          logger.debug("Second-pass messages: %s", second_pass_messages)
          final_output = agent.llm_invoke(
              messages=second_pass_messages,
-             seed=101,
+             seed=seed,
              allow_tool_use=True,
          )
      else:
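`preview_second_pass` previously pinned `seed=42` for the first pass and `seed=101` for the second; both passes now share the caller-supplied `seed`. A call sketch with placeholder arguments (`prompt`, `coordinator`, and `make_agent` stand in for real objects):

```python
preview_second_pass(
    prompt,                                  # the ChatPrompt under inspection
    dataset_item={"question": "What does the tool return?"},
    coordinator=coordinator,                 # an MCPSecondPassCoordinator
    agent_factory=make_agent,                # hypothetical factory: prompt -> agent
    seed=123,                                # used by both llm_invoke passes
)
```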