opik-optimizer 2.1.3__py3-none-any.whl → 2.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. opik_optimizer/__init__.py +0 -2
  2. opik_optimizer/base_optimizer.py +313 -144
  3. opik_optimizer/evolutionary_optimizer/crossover_ops.py +31 -4
  4. opik_optimizer/evolutionary_optimizer/evaluation_ops.py +23 -3
  5. opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +122 -95
  6. opik_optimizer/evolutionary_optimizer/mcp.py +11 -6
  7. opik_optimizer/evolutionary_optimizer/mutation_ops.py +25 -5
  8. opik_optimizer/evolutionary_optimizer/population_ops.py +26 -10
  9. opik_optimizer/evolutionary_optimizer/reporting.py +5 -5
  10. opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +53 -99
  11. opik_optimizer/few_shot_bayesian_optimizer/reporting.py +4 -4
  12. opik_optimizer/gepa_optimizer/gepa_optimizer.py +345 -201
  13. opik_optimizer/gepa_optimizer/reporting.py +291 -22
  14. opik_optimizer/hierarchical_reflective_optimizer/hierarchical_reflective_optimizer.py +90 -167
  15. opik_optimizer/hierarchical_reflective_optimizer/prompts.py +7 -1
  16. opik_optimizer/hierarchical_reflective_optimizer/reporting.py +168 -75
  17. opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +185 -205
  18. opik_optimizer/meta_prompt_optimizer/reporting.py +4 -4
  19. opik_optimizer/mipro_optimizer/__init__.py +2 -2
  20. opik_optimizer/mipro_optimizer/_lm.py +4 -4
  21. opik_optimizer/mipro_optimizer/{_mipro_optimizer_v2.py → mipro_optimizer_v2.py} +1 -7
  22. opik_optimizer/mipro_optimizer/utils.py +1 -0
  23. opik_optimizer/optimizable_agent.py +7 -4
  24. opik_optimizer/optimization_config/chat_prompt.py +7 -10
  25. opik_optimizer/parameter_optimizer/parameter_optimizer.py +188 -40
  26. opik_optimizer/parameter_optimizer/reporting.py +148 -0
  27. opik_optimizer/reporting_utils.py +60 -15
  28. opik_optimizer/utils/__init__.py +3 -0
  29. opik_optimizer/utils/candidate_utils.py +52 -0
  30. opik_optimizer/utils/core.py +35 -2
  31. opik_optimizer/utils/prompt_segments.py +1 -2
  32. {opik_optimizer-2.1.3.dist-info → opik_optimizer-2.2.1.dist-info}/METADATA +2 -3
  33. {opik_optimizer-2.1.3.dist-info → opik_optimizer-2.2.1.dist-info}/RECORD +36 -36
  34. opik_optimizer/evolutionary_optimizer/llm_support.py +0 -136
  35. opik_optimizer/mipro_optimizer/mipro_optimizer.py +0 -680
  36. {opik_optimizer-2.1.3.dist-info → opik_optimizer-2.2.1.dist-info}/WHEEL +0 -0
  37. {opik_optimizer-2.1.3.dist-info → opik_optimizer-2.2.1.dist-info}/licenses/LICENSE +0 -0
  38. {opik_optimizer-2.1.3.dist-info → opik_optimizer-2.2.1.dist-info}/top_level.txt +0 -0

opik_optimizer/gepa_optimizer/gepa_optimizer.py
@@ -1,10 +1,10 @@
  import logging
- from contextlib import nullcontext
- from typing import Any, ContextManager
+ from typing import Any

  import opik
- from opik import Dataset
+ from opik import Dataset, opik_context
+ from opik.evaluation import evaluator as opik_evaluator
  from opik.evaluation.metrics.score_result import ScoreResult

  from ..base_optimizer import BaseOptimizer
@@ -16,7 +16,10 @@ from ..utils import (
  create_litellm_agent_class,
  disable_experiment_reporting,
  enable_experiment_reporting,
+ unique_ordered_by_key,
  )
+ from ..task_evaluator import _create_metric_class
+ from ..reporting_utils import suppress_opik_logs
  from .. import task_evaluator
  from . import reporting as gepa_reporting
  from .adapter import OpikDataInst, OpikGEPAAdapter
@@ -25,16 +28,30 @@ logger = logging.getLogger(__name__)


  class GepaOptimizer(BaseOptimizer):
- """Minimal integration against the upstream GEPA engine."""
+ """
+ The GEPA (Genetic-Pareto) Optimizer uses a genetic algorithm with Pareto optimization
+ to improve prompts while balancing multiple objectives.
+
+ This algorithm is well-suited for complex optimization tasks where you want to find
+ prompts that balance trade-offs between different quality metrics.
+
+ Args:
+ model: LiteLLM model name for the optimization algorithm
+ model_parameters: Optional dict of LiteLLM parameters for optimizer's internal LLM calls.
+ Common params: temperature, max_tokens, max_completion_tokens, top_p.
+ See: https://docs.litellm.ai/docs/completion/input
+ n_threads: Number of parallel threads for evaluation
+ verbose: Controls internal logging/progress bars (0=off, 1=on)
+ seed: Random seed for reproducibility
+ """

  def __init__(
  self,
- model: str,
- project_name: str | None = None,
- reflection_model: str | None = None,
+ model: str = "gpt-4o",
+ model_parameters: dict[str, Any] | None = None,
+ n_threads: int = 6,
  verbose: int = 1,
  seed: int = 42,
- **model_kwargs: Any,
  ) -> None:
  # Validate required parameters
  if model is None:
@@ -45,16 +62,6 @@ class GepaOptimizer(BaseOptimizer):
  raise ValueError("model cannot be empty or whitespace-only")

  # Validate optional parameters
- if project_name is not None and not isinstance(project_name, str):
- raise ValueError(
- f"project_name must be a string or None, got {type(project_name).__name__}"
- )
-
- if reflection_model is not None and not isinstance(reflection_model, str):
- raise ValueError(
- f"reflection_model must be a string or None, got {type(reflection_model).__name__}"
- )
-
  if not isinstance(verbose, int):
  raise ValueError(
  f"verbose must be an integer, got {type(verbose).__name__}"
@@ -65,32 +72,19 @@ class GepaOptimizer(BaseOptimizer):
  if not isinstance(seed, int):
  raise ValueError(f"seed must be an integer, got {type(seed).__name__}")

- super().__init__(model=model, verbose=verbose, seed=seed, **model_kwargs)
- self.project_name = project_name
- self.reflection_model = reflection_model or model
- self.num_threads = self.model_kwargs.pop("num_threads", 6)
+ super().__init__(
+ model=model, verbose=verbose, seed=seed, model_parameters=model_parameters
+ )
+ self.n_threads = n_threads
  self._gepa_live_metric_calls = 0
  self._adapter = None # Will be set during optimization

  def get_optimizer_metadata(self) -> dict[str, Any]:
  return {
- "project_name": self.project_name,
- "reflection_model": self.reflection_model,
+ "model": self.model,
+ "n_threads": self.n_threads,
  }

- def cleanup(self) -> None:
- """
- Clean up GEPA-specific resources.
- """
- # Call parent cleanup
- super().cleanup()
-
- # Clear GEPA-specific resources
- self._adapter = None
- self._gepa_live_metric_calls = 0
-
- logger.debug("Cleaned up GEPA-specific resources")
-
  # ------------------------------------------------------------------
  # Helpers
  # ------------------------------------------------------------------
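
For orientation, a minimal construction sketch against the 2.2.1 signature shown above. The deep import path mirrors the module location in the file list and may also be re-exported higher up; the argument values are illustrative only. Note that project_name and reflection_model no longer exist on the constructor.

from opik_optimizer.gepa_optimizer.gepa_optimizer import GepaOptimizer

optimizer = GepaOptimizer(
    model="gpt-4o",                         # LiteLLM model name, now the default
    model_parameters={"temperature": 0.2},  # replaces the removed **model_kwargs
    n_threads=6,                            # replaces model_kwargs["num_threads"]
    verbose=1,
    seed=42,
)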
@@ -161,7 +155,19 @@
  n_samples: int | None = None,
  auto_continue: bool = False,
  agent_class: type[OptimizableAgent] | None = None,
- **kwargs: Any,
+ project_name: str = "Optimization",
+ max_trials: int = 10,
+ reflection_minibatch_size: int = 3,
+ candidate_selection_strategy: str = "pareto",
+ skip_perfect_score: bool = True,
+ perfect_score: float = 1.0,
+ use_merge: bool = False,
+ max_merge_invocations: int = 5,
+ run_dir: str | None = None,
+ track_best_outputs: bool = False,
+ display_progress_bar: bool = False,
+ seed: int = 42,
+ raise_on_exception: bool = True,
  ) -> OptimizationResult:
  """
  Optimize a prompt using GEPA (Genetic-Pareto) algorithm.
@@ -171,54 +177,33 @@
  dataset: Opik Dataset to optimize on
  metric: Metric function to evaluate on
  experiment_config: Optional configuration for the experiment
+ max_trials: Maximum number of different prompts to test (default: 10)
  n_samples: Optional number of items to test in the dataset
  auto_continue: Whether to auto-continue optimization
  agent_class: Optional agent class to use
- **kwargs: GEPA-specific parameters:
- max_metric_calls (int | None): Maximum number of metric evaluations (default: 30)
- reflection_minibatch_size (int): Size of reflection minibatches (default: 3)
- candidate_selection_strategy (str): Strategy for candidate selection (default: "pareto")
- skip_perfect_score (bool): Skip candidates with perfect scores (default: True)
- perfect_score (float): Score considered perfect (default: 1.0)
- use_merge (bool): Enable merge operations (default: False)
- max_merge_invocations (int): Maximum merge invocations (default: 5)
- run_dir (str | None): Directory for run outputs (default: None)
- track_best_outputs (bool): Track best outputs during optimization (default: False)
- display_progress_bar (bool): Display progress bar (default: False)
- seed (int): Random seed for reproducibility (default: 42)
- raise_on_exception (bool): Raise exceptions instead of continuing (default: True)
- mcp_config (MCPExecutionConfig | None): MCP tool calling configuration (default: None)
+ reflection_minibatch_size: Size of reflection minibatches (default: 3)
+ candidate_selection_strategy: Strategy for candidate selection (default: "pareto")
+ skip_perfect_score: Skip candidates with perfect scores (default: True)
+ perfect_score: Score considered perfect (default: 1.0)
+ use_merge: Enable merge operations (default: False)
+ max_merge_invocations: Maximum merge invocations (default: 5)
+ run_dir: Directory for run outputs (default: None)
+ track_best_outputs: Track best outputs during optimization (default: False)
+ display_progress_bar: Display progress bar (default: False)
+ seed: Random seed for reproducibility (default: 42)
+ raise_on_exception: Raise exceptions instead of continuing (default: True)

  Returns:
  OptimizationResult: Result of the optimization
  """
  # Use base class validation and setup methods
- self.validate_optimization_inputs(prompt, dataset, metric)
-
- # Extract GEPA-specific parameters from kwargs
- max_metric_calls: int | None = kwargs.get("max_metric_calls", 30)
- reflection_minibatch_size: int = int(kwargs.get("reflection_minibatch_size", 3))
- candidate_selection_strategy: str = str(
- kwargs.get("candidate_selection_strategy", "pareto")
- )
- skip_perfect_score: bool = kwargs.get("skip_perfect_score", True)
- perfect_score: float = float(kwargs.get("perfect_score", 1.0))
- use_merge: bool = kwargs.get("use_merge", False)
- max_merge_invocations: int = int(kwargs.get("max_merge_invocations", 5))
- run_dir: str | None = kwargs.get("run_dir", None)
- track_best_outputs: bool = kwargs.get("track_best_outputs", False)
- display_progress_bar: bool = kwargs.get("display_progress_bar", False)
- seed: int = int(kwargs.get("seed", 42))
- raise_on_exception: bool = kwargs.get("raise_on_exception", True)
- kwargs.pop("mcp_config", None) # Added for MCP support (for future use)
+ self._validate_optimization_inputs(prompt, dataset, metric)

  prompt = prompt.copy()
- if self.project_name:
- prompt.project_name = self.project_name
  if prompt.model is None:
  prompt.model = self.model
  if not prompt.model_kwargs:
- prompt.model_kwargs = dict(self.model_kwargs)
+ prompt.model_kwargs = dict(self.model_parameters)

  seed_prompt_text = self._extract_system_text(prompt)
  input_key, output_key = self._infer_dataset_keys(dataset)
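
A hedged calling sketch for the reworked signature: the former **kwargs become explicit keyword arguments, and max_metric_calls is now derived from max_trials instead of being passed directly. Here `optimizer` is the instance built earlier, while `prompt`, `dataset`, and `metric` are assumed to be a chat prompt, an Opik dataset, and a metric function prepared elsewhere; the values are illustrative.

result = optimizer.optimize_prompt(
    prompt=prompt,
    dataset=dataset,
    metric=metric,
    n_samples=50,                           # evaluate a subset of dataset items
    project_name="Optimization",
    max_trials=10,                          # replaces max_metric_calls=30 from 2.1.3
    reflection_minibatch_size=3,
    candidate_selection_strategy="pareto",
)
print(result)                               # OptimizationResult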
@@ -227,12 +212,38 @@
  if n_samples and 0 < n_samples < len(items):
  items = items[:n_samples]

+ # Calculate max_metric_calls from max_trials and effective samples
+ effective_n_samples = len(items)
+ max_metric_calls = max_trials * effective_n_samples
+ budget_limited_trials = (
+ max_metric_calls // effective_n_samples if effective_n_samples else 0
+ )
+ if reflection_minibatch_size > max_trials:
+ logger.warning(
+ "reflection_minibatch_size (%s) exceeds max_trials (%s); GEPA reflection will not run. "
+ "Increase max_trials or lower the minibatch.",
+ reflection_minibatch_size,
+ max_trials,
+ )
+ elif (
+ budget_limited_trials and reflection_minibatch_size > budget_limited_trials
+ ):
+ logger.warning(
+ "reflection_minibatch_size (%s) exceeds the number of candidates allowed by the metric budget (%s). "
+ "Consider increasing max_trials or n_samples.",
+ reflection_minibatch_size,
+ budget_limited_trials,
+ )
+
  data_insts = self._build_data_insts(items, input_key, output_key)

  self._gepa_live_metric_calls = 0

  base_prompt = prompt.copy()

+ # Set project name from parameter
+ self.project_name = project_name
+
  opt_id: str | None = None
  ds_id: str | None = getattr(dataset, "id", None)

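To make the new budget arithmetic concrete, a small worked example with hypothetical numbers (not taken from the diff): 10 trials over 50 evaluated items allow 500 metric calls, and the default reflection minibatch of 3 fits that budget without triggering either warning.

max_trials = 10
effective_n_samples = 50
max_metric_calls = max_trials * effective_n_samples              # 500
budget_limited_trials = max_metric_calls // effective_n_samples  # 10
assert 3 <= budget_limited_trials  # reflection_minibatch_size=3 raises no warning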
@@ -249,8 +260,10 @@
  ) as optimization:
  try:
  opt_id = optimization.id if optimization is not None else None
+ self.current_optimization_id = opt_id
  except Exception:
  opt_id = None
+ self.current_optimization_id = None

  gepa_reporting.display_header(
  algorithm=self.__class__.__name__,
@@ -266,11 +279,11 @@
  optimizer_config={
  "optimizer": self.__class__.__name__,
  "model": self.model,
- "reflection_model": self.reflection_model,
+ "max_trials": max_trials,
+ "n_samples": n_samples or "all",
  "max_metric_calls": max_metric_calls,
  "reflection_minibatch_size": reflection_minibatch_size,
  "candidate_selection_strategy": candidate_selection_strategy,
- "n_samples": n_samples or "all",
  },
  verbose=self.verbose,
  )
@@ -280,15 +293,6 @@
  initial_score = 0.0
  with gepa_reporting.baseline_evaluation(verbose=self.verbose) as baseline:
  try:
- baseline_suppress: ContextManager[Any] = nullcontext()
- try:
- from ..reporting_utils import (
- suppress_opik_logs as _suppress_logs,
- )
-
- baseline_suppress = _suppress_logs()
- except Exception:
- pass
  eval_kwargs = dict(
  prompt=prompt,
  dataset=dataset,
@@ -298,7 +302,7 @@
  extra_metadata={"phase": "baseline"},
  verbose=0,
  )
- with baseline_suppress:
+ with suppress_opik_logs():
  initial_score = float(
  self._evaluate_prompt_logged(**eval_kwargs)
  )
@@ -307,12 +311,11 @@
  logger.exception("Baseline evaluation failed")

  adapter_prompt = self._apply_system_text(base_prompt, seed_prompt_text)
- adapter_prompt.project_name = self.project_name
  adapter_prompt.model = self.model
  # Filter out GEPA-specific parameters that shouldn't be passed to LLM
  filtered_model_kwargs = {
  k: v
- for k, v in self.model_kwargs.items()
+ for k, v in self.model_parameters.items()
  if k not in ["num_prompts_per_round", "rounds"]
  }
  adapter_prompt.model_kwargs = filtered_model_kwargs
@@ -330,40 +333,52 @@
  except Exception as exc: # pragma: no cover
  raise ImportError("gepa package is required for GepaOptimizer") from exc

- kwargs_gepa: dict[str, Any] = {
- "seed_candidate": {"system_prompt": seed_prompt_text},
- "trainset": data_insts,
- "valset": data_insts,
- "adapter": adapter,
- "task_lm": None,
- "reflection_lm": self.reflection_model,
- "candidate_selection_strategy": candidate_selection_strategy,
- "skip_perfect_score": skip_perfect_score,
- "reflection_minibatch_size": reflection_minibatch_size,
- "perfect_score": perfect_score,
- "use_merge": use_merge,
- "max_merge_invocations": max_merge_invocations,
- "max_metric_calls": max_metric_calls,
- "run_dir": run_dir,
- "track_best_outputs": track_best_outputs,
- "display_progress_bar": display_progress_bar,
- "seed": seed,
- "raise_on_exception": raise_on_exception,
- "logger": gepa_reporting.RichGEPAOptimizerLogger(
- self, verbose=self.verbose
- ),
- }
+ # When using our Rich logger, disable GEPA's native progress bar to avoid conflicts
+ use_gepa_progress_bar = display_progress_bar if self.verbose == 0 else False
+
+ with gepa_reporting.start_gepa_optimization(
+ verbose=self.verbose, max_trials=max_trials
+ ) as reporter:
+ # Create logger with progress bar support
+ logger_instance = gepa_reporting.RichGEPAOptimizerLogger(
+ self,
+ verbose=self.verbose,
+ progress=reporter.progress,
+ task_id=reporter.task_id,
+ max_trials=max_trials,
+ )
+
+ kwargs_gepa: dict[str, Any] = {
+ "seed_candidate": {"system_prompt": seed_prompt_text},
+ "trainset": data_insts,
+ "valset": data_insts,
+ "adapter": adapter,
+ "task_lm": None,
+ "reflection_lm": self.model,
+ "candidate_selection_strategy": candidate_selection_strategy,
+ "skip_perfect_score": skip_perfect_score,
+ "reflection_minibatch_size": reflection_minibatch_size,
+ "perfect_score": perfect_score,
+ "use_merge": use_merge,
+ "max_merge_invocations": max_merge_invocations,
+ "max_metric_calls": max_metric_calls,
+ "run_dir": run_dir,
+ "track_best_outputs": track_best_outputs,
+ "display_progress_bar": use_gepa_progress_bar,
+ "seed": seed,
+ "raise_on_exception": raise_on_exception,
+ "logger": logger_instance,
+ }

- optimize_sig = None
- try:
- optimize_sig = inspect.signature(gepa.optimize)
- except Exception:
  optimize_sig = None
+ try:
+ optimize_sig = inspect.signature(gepa.optimize)
+ except Exception:
+ optimize_sig = None

- if optimize_sig and "stop_callbacks" not in optimize_sig.parameters:
- kwargs_gepa["max_metric_calls"] = max_metric_calls
+ if optimize_sig and "stop_callbacks" not in optimize_sig.parameters:
+ kwargs_gepa["max_metric_calls"] = max_metric_calls

- with gepa_reporting.start_gepa_optimization(verbose=self.verbose):
  gepa_result = gepa.optimize(**kwargs_gepa)

  try:
@@ -381,110 +396,202 @@
  candidates: list[dict[str, str]] = getattr(gepa_result, "candidates", []) or []
  val_scores: list[float] = list(getattr(gepa_result, "val_aggregate_scores", []))

+ indexed_candidates: list[tuple[int, dict[str, str]]] = list(
+ enumerate(candidates)
+ )
+ filtered_indexed_candidates = unique_ordered_by_key(
+ indexed_candidates,
+ key=lambda item: self._extract_system_text_from_candidate(
+ item[1], seed_prompt_text
+ ).strip(),
+ )
+ filtered_candidates: list[dict[str, str]] = [
+ candidate for _, candidate in filtered_indexed_candidates
+ ]
+ filtered_val_scores: list[float | None] = [
+ val_scores[idx] if idx < len(val_scores) else None
+ for idx, _ in filtered_indexed_candidates
+ ]
+
  rescored: list[float] = []
  candidate_rows: list[dict[str, Any]] = []
  history: list[dict[str, Any]] = []

- for idx, candidate in enumerate(candidates):
- candidate_prompt = self._extract_system_text_from_candidate(
- candidate, seed_prompt_text
- )
- prompt_variant = self._apply_system_text(prompt, candidate_prompt)
- prompt_variant.project_name = self.project_name
- prompt_variant.model = self.model
- # Filter out GEPA-specific parameters that shouldn't be passed to LLM
- filtered_model_kwargs = {
- k: v
- for k, v in self.model_kwargs.items()
- if k not in ["num_prompts_per_round", "rounds"]
- }
- prompt_variant.model_kwargs = filtered_model_kwargs
+ # Import convert_tqdm_to_rich for suppressing display functions
+ from ..reporting_utils import convert_tqdm_to_rich
+
+ # Wrap rescoring to prevent OPIK messages and experiment link displays
+ with suppress_opik_logs():
+ with convert_tqdm_to_rich(verbose=0):
+ for idx, (original_idx, candidate) in enumerate(
+ filtered_indexed_candidates
+ ):
+ candidate_prompt = self._extract_system_text_from_candidate(
+ candidate, seed_prompt_text
+ )
+ prompt_variant = self._apply_system_text(prompt, candidate_prompt)
+ prompt_variant.model = self.model
+ # Filter out GEPA-specific parameters that shouldn't be passed to LLM
+ filtered_model_kwargs = {
+ k: v
+ for k, v in self.model_parameters.items()
+ if k not in ["num_prompts_per_round", "rounds"]
+ }
+ prompt_variant.model_kwargs = filtered_model_kwargs

- eval_kwargs = dict(
- prompt=prompt_variant,
- dataset=dataset,
- metric=metric,
- n_samples=n_samples,
- optimization_id=opt_id,
- extra_metadata={"phase": "rescoring", "candidate_index": idx},
- verbose=0,
- )
- try:
- score = float(self._evaluate_prompt_logged(**eval_kwargs))
- except Exception:
- logger.debug("Rescoring failed for candidate %s", idx, exc_info=True)
- score = 0.0
-
- rescored.append(score)
- candidate_rows.append(
- {
- "iteration": idx + 1,
- "system_prompt": candidate_prompt,
- "gepa_score": val_scores[idx] if idx < len(val_scores) else None,
- "opik_score": score,
- "source": self.__class__.__name__,
- }
- )
- history.append(
- {
- "iteration": idx + 1,
- "prompt_candidate": candidate_prompt,
- "scores": [
+ eval_kwargs = dict(
+ prompt=prompt_variant,
+ dataset=dataset,
+ metric=metric,
+ n_samples=n_samples,
+ optimization_id=opt_id,
+ extra_metadata={"phase": "rescoring", "candidate_index": idx},
+ verbose=0,
+ )
+ try:
+ score = float(self._evaluate_prompt_logged(**eval_kwargs))
+ except Exception:
+ logger.debug(
+ "Rescoring failed for candidate %s", idx, exc_info=True
+ )
+ score = 0.0
+
+ rescored.append(score)
+ candidate_rows.append(
  {
- "metric_name": f"GEPA-{metric.__name__}",
- "score": val_scores[idx] if idx < len(val_scores) else None,
- },
- {"metric_name": metric.__name__, "score": score},
- ],
- "metadata": {},
- }
- )
+ "iteration": idx + 1,
+ "system_prompt": candidate_prompt,
+ "gepa_score": filtered_val_scores[idx],
+ "opik_score": score,
+ "source": self.__class__.__name__,
+ }
+ )
+ history.append(
+ {
+ "iteration": idx + 1,
+ "prompt_candidate": candidate_prompt,
+ "scores": [
+ {
+ "metric_name": f"GEPA-{metric.__name__}",
+ "score": filtered_val_scores[idx],
+ },
+ {"metric_name": metric.__name__, "score": score},
+ ],
+ "metadata": {},
+ }
+ )

  if rescored:
- best_idx = max(range(len(rescored)), key=lambda i: rescored[i])
+
+ def _tie_break(idx: int) -> tuple[float, float, int]:
+ opik_score = rescored[idx]
+ gepa_score = filtered_val_scores[idx]
+ gepa_numeric = (
+ float(gepa_score)
+ if isinstance(gepa_score, (int, float))
+ else float("-inf")
+ )
+ return opik_score, gepa_numeric, idx
+
+ best_idx = max(range(len(rescored)), key=_tie_break)
  best_score = rescored[best_idx]
  else:
- best_idx = getattr(gepa_result, "best_idx", 0) or 0
- best_score = float(val_scores[best_idx]) if val_scores else 0.0
+ if filtered_indexed_candidates:
+ gepa_best_idx = getattr(gepa_result, "best_idx", 0) or 0
+ best_idx = next(
+ (
+ i
+ for i, (original_idx, _) in enumerate(
+ filtered_indexed_candidates
+ )
+ if original_idx == gepa_best_idx
+ ),
+ 0,
+ )
+ if filtered_val_scores and 0 <= best_idx < len(filtered_val_scores):
+ score_value = filtered_val_scores[best_idx]
+ best_score = float(score_value) if score_value is not None else 0.0
+ else:
+ best_score = float(initial_score)
+ else:
+ best_idx = 0
+ best_score = float(initial_score)

  best_candidate = (
- candidates[best_idx] if candidates else {"system_prompt": seed_prompt_text}
+ filtered_candidates[best_idx]
+ if filtered_candidates
+ else {"system_prompt": seed_prompt_text}
  )
  best_prompt_text = self._extract_system_text_from_candidate(
  best_candidate, seed_prompt_text
  )

  final_prompt = self._apply_system_text(prompt, best_prompt_text)
- final_prompt.project_name = self.project_name
  final_prompt.model = self.model
  # Filter out GEPA-specific parameters that shouldn't be passed to LLM
  filtered_model_kwargs = {
  k: v
- for k, v in self.model_kwargs.items()
+ for k, v in self.model_parameters.items()
  if k not in ["num_prompts_per_round", "rounds"]
  }
  final_prompt.model_kwargs = filtered_model_kwargs

- final_eval_kwargs = dict(
- prompt=final_prompt,
- dataset=dataset,
- metric=metric,
- n_samples=n_samples,
- optimization_id=opt_id,
- extra_metadata={"phase": "final", "selected": True},
- verbose=0,
- )
- suppress_logs: ContextManager[Any] = nullcontext()
- try:
- from ..reporting_utils import suppress_opik_logs as _suppress_logs
-
- suppress_logs = _suppress_logs()
- except Exception:
- pass
+ final_eval_result: Any | None = None

- with suppress_logs:
+ with suppress_opik_logs():
  try:
- self._evaluate_prompt_logged(**final_eval_kwargs)
+ final_agent_cls = create_litellm_agent_class(
+ final_prompt, optimizer_ref=self
+ )
+ final_agent = final_agent_cls(final_prompt)
+
+ def final_llm_task(dataset_item: dict[str, Any]) -> dict[str, str]:
+ messages = final_prompt.get_messages(dataset_item)
+ raw = final_agent.invoke(messages)
+ if self.current_optimization_id:
+ opik_context.update_current_trace(
+ tags=[self.current_optimization_id, "Evaluation"]
+ )
+ return {mappers.EVALUATED_LLM_TASK_OUTPUT: raw.strip()}
+
+ configuration_updates = self._drop_none(
+ {"gepa": {"phase": "final", "selected": True}}
+ )
+ final_experiment_config = self._prepare_experiment_config(
+ prompt=final_prompt,
+ dataset=dataset,
+ metric=metric,
+ experiment_config=experiment_config,
+ configuration_updates=configuration_updates,
+ )
+
+ metric_class = _create_metric_class(metric)
+
+ if opt_id:
+ final_eval_result = opik_evaluator.evaluate_optimization_trial(
+ optimization_id=opt_id,
+ dataset=dataset,
+ task=final_llm_task,
+ project_name=final_experiment_config.get("project_name"),
+ dataset_item_ids=None,
+ scoring_metrics=[metric_class],
+ task_threads=self.n_threads,
+ nb_samples=n_samples,
+ experiment_config=final_experiment_config,
+ verbose=0,
+ )
+ else:
+ final_eval_result = opik_evaluator.evaluate(
+ dataset=dataset,
+ task=final_llm_task,
+ project_name=final_experiment_config.get("project_name"),
+ dataset_item_ids=None,
+ scoring_metrics=[metric_class],
+ task_threads=self.n_threads,
+ nb_samples=n_samples,
+ experiment_config=final_experiment_config,
+ verbose=0,
+ )
  except Exception:
  logger.debug("Final evaluation failed", exc_info=True)

@@ -514,28 +621,55 @@
  except Exception:
  logger.debug("Per-item diagnostics failed", exc_info=True)

+ trial_info: dict[str, Any] | None = None
+ if final_eval_result is not None:
+ experiment_name = getattr(final_eval_result, "experiment_name", None)
+ experiment_url = getattr(final_eval_result, "experiment_url", None)
+ trial_ids = []
+ try:
+ trial_ids = sorted(
+ {
+ str(test_result.trial_id)
+ for test_result in getattr(
+ final_eval_result, "test_results", []
+ )
+ if getattr(test_result, "trial_id", None) is not None
+ }
+ )
+ except Exception:
+ logger.debug("Failed to extract trial IDs", exc_info=True)
+
+ trial_info = {
+ "experiment_name": experiment_name,
+ "experiment_url": experiment_url,
+ "trial_ids": trial_ids,
+ }
+
  details: dict[str, Any] = {
  "model": self.model,
- "temperature": self.model_kwargs.get("temperature"),
+ "temperature": self.model_parameters.get("temperature"),
  "optimizer": self.__class__.__name__,
- "num_candidates": getattr(gepa_result, "num_candidates", None),
+ "num_candidates": len(filtered_candidates),
  "total_metric_calls": getattr(gepa_result, "total_metric_calls", None),
  "parents": getattr(gepa_result, "parents", None),
- "val_scores": val_scores,
+ "val_scores": filtered_val_scores,
  "opik_rescored_scores": rescored,
  "candidate_summary": candidate_rows,
  "best_candidate_iteration": (
  candidate_rows[best_idx]["iteration"] if candidate_rows else 0
  ),
- "selected_candidate_index": best_idx,
+ "selected_candidate_index": best_idx if filtered_candidates else None,
  "selected_candidate_gepa_score": (
- val_scores[best_idx] if best_idx < len(val_scores) else None
+ filtered_val_scores[best_idx]
+ if filtered_val_scores and 0 <= best_idx < len(filtered_val_scores)
+ else None
  ),
  "selected_candidate_opik_score": best_score,
  "gepa_live_metric_used": True,
  "gepa_live_metric_call_count": self._gepa_live_metric_calls,
  "selected_candidate_item_scores": per_item_scores,
  "dataset_item_ids": [item.get("id") for item in items],
+ "selected_candidate_trial_info": trial_info,
  }
  if experiment_config:
  details["experiment"] = experiment_config
@@ -547,7 +681,10 @@
  candidate_rows, verbose=self.verbose
  )
  gepa_reporting.display_selected_candidate(
- best_prompt_text, best_score, verbose=self.verbose
+ best_prompt_text,
+ best_score,
+ verbose=self.verbose,
+ trial_info=trial_info,
  )

  if logger.isEnabledFor(logging.DEBUG):
@@ -618,7 +755,7 @@
  if prompt.model is None:
  prompt.model = self.model
  if prompt.model_kwargs is None:
- prompt.model_kwargs = self.model_kwargs
+ prompt.model_kwargs = self.model_parameters

  agent_class = create_litellm_agent_class(prompt, optimizer_ref=self)
  self.agent_class = agent_class
@@ -627,6 +764,13 @@
  def llm_task(dataset_item: dict[str, Any]) -> dict[str, str]:
  messages = prompt.get_messages(dataset_item)
  raw = agent.invoke(messages)
+
+ # Add tags to trace for optimization tracking
+ if self.current_optimization_id:
+ opik_context.update_current_trace(
+ tags=[self.current_optimization_id, "Evaluation"]
+ )
+
  return {mappers.EVALUATED_LLM_TASK_OUTPUT: raw.strip()}

  configuration_updates = self._drop_none({"gepa": extra_metadata})
@@ -643,7 +787,7 @@
  dataset_item_ids=dataset_item_ids,
  metric=metric,
  evaluated_task=llm_task,
- num_threads=self.num_threads,
+ num_threads=self.n_threads,
  project_name=experiment_config.get("project_name"),
  experiment_config=experiment_config,
  optimization_id=optimization_id,