opik-optimizer 2.1.3-py3-none-any.whl → 2.2.0-py3-none-any.whl

This diff shows the content changes between publicly released package versions, as they appear in their respective public registries.
Files changed (36)
  1. opik_optimizer/__init__.py +0 -2
  2. opik_optimizer/base_optimizer.py +314 -145
  3. opik_optimizer/evolutionary_optimizer/crossover_ops.py +31 -4
  4. opik_optimizer/evolutionary_optimizer/evaluation_ops.py +23 -3
  5. opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +122 -95
  6. opik_optimizer/evolutionary_optimizer/mcp.py +11 -6
  7. opik_optimizer/evolutionary_optimizer/mutation_ops.py +25 -5
  8. opik_optimizer/evolutionary_optimizer/population_ops.py +26 -10
  9. opik_optimizer/evolutionary_optimizer/reporting.py +5 -5
  10. opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +53 -99
  11. opik_optimizer/few_shot_bayesian_optimizer/reporting.py +4 -4
  12. opik_optimizer/gepa_optimizer/gepa_optimizer.py +183 -172
  13. opik_optimizer/gepa_optimizer/reporting.py +164 -22
  14. opik_optimizer/hierarchical_reflective_optimizer/hierarchical_reflective_optimizer.py +90 -167
  15. opik_optimizer/hierarchical_reflective_optimizer/prompts.py +7 -1
  16. opik_optimizer/hierarchical_reflective_optimizer/reporting.py +168 -75
  17. opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +185 -205
  18. opik_optimizer/meta_prompt_optimizer/reporting.py +4 -4
  19. opik_optimizer/mipro_optimizer/__init__.py +2 -2
  20. opik_optimizer/mipro_optimizer/_lm.py +4 -4
  21. opik_optimizer/mipro_optimizer/{_mipro_optimizer_v2.py → mipro_optimizer_v2.py} +1 -7
  22. opik_optimizer/mipro_optimizer/utils.py +1 -0
  23. opik_optimizer/optimizable_agent.py +7 -4
  24. opik_optimizer/optimization_config/chat_prompt.py +7 -10
  25. opik_optimizer/parameter_optimizer/parameter_optimizer.py +188 -40
  26. opik_optimizer/parameter_optimizer/reporting.py +148 -0
  27. opik_optimizer/reporting_utils.py +42 -15
  28. opik_optimizer/utils/core.py +16 -2
  29. opik_optimizer/utils/prompt_segments.py +1 -2
  30. {opik_optimizer-2.1.3.dist-info → opik_optimizer-2.2.0.dist-info}/METADATA +2 -3
  31. {opik_optimizer-2.1.3.dist-info → opik_optimizer-2.2.0.dist-info}/RECORD +34 -35
  32. opik_optimizer/evolutionary_optimizer/llm_support.py +0 -136
  33. opik_optimizer/mipro_optimizer/mipro_optimizer.py +0 -680
  34. {opik_optimizer-2.1.3.dist-info → opik_optimizer-2.2.0.dist-info}/WHEEL +0 -0
  35. {opik_optimizer-2.1.3.dist-info → opik_optimizer-2.2.0.dist-info}/licenses/LICENSE +0 -0
  36. {opik_optimizer-2.1.3.dist-info → opik_optimizer-2.2.0.dist-info}/top_level.txt +0 -0
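
Only one of the 36 files is reproduced below: opik_optimizer/gepa_optimizer/gepa_optimizer.py (item 12), which contains the GepaOptimizer rewrite.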
opik_optimizer/gepa_optimizer/gepa_optimizer.py

@@ -4,7 +4,7 @@ from typing import Any, ContextManager
 from collections.abc import Callable
 
 import opik
-from opik import Dataset
+from opik import Dataset, opik_context
 from opik.evaluation.metrics.score_result import ScoreResult
 
 from ..base_optimizer import BaseOptimizer
@@ -17,6 +17,7 @@ from ..utils import (
     disable_experiment_reporting,
     enable_experiment_reporting,
 )
+from ..reporting_utils import suppress_opik_logs
 from .. import task_evaluator
 from . import reporting as gepa_reporting
 from .adapter import OpikDataInst, OpikGEPAAdapter
@@ -25,16 +26,30 @@ logger = logging.getLogger(__name__)
 
 
 class GepaOptimizer(BaseOptimizer):
-    """Minimal integration against the upstream GEPA engine."""
+    """
+    The GEPA (Genetic-Pareto) Optimizer uses a genetic algorithm with Pareto optimization
+    to improve prompts while balancing multiple objectives.
+
+    This algorithm is well-suited for complex optimization tasks where you want to find
+    prompts that balance trade-offs between different quality metrics.
+
+    Args:
+        model: LiteLLM model name for the optimization algorithm
+        model_parameters: Optional dict of LiteLLM parameters for optimizer's internal LLM calls.
+            Common params: temperature, max_tokens, max_completion_tokens, top_p.
+            See: https://docs.litellm.ai/docs/completion/input
+        n_threads: Number of parallel threads for evaluation
+        verbose: Controls internal logging/progress bars (0=off, 1=on)
+        seed: Random seed for reproducibility
+    """
 
     def __init__(
         self,
-        model: str,
-        project_name: str | None = None,
-        reflection_model: str | None = None,
+        model: str = "gpt-4o",
+        model_parameters: dict[str, Any] | None = None,
+        n_threads: int = 6,
         verbose: int = 1,
         seed: int = 42,
-        **model_kwargs: Any,
     ) -> None:
         # Validate required parameters
         if model is None:
@@ -45,16 +60,6 @@ class GepaOptimizer(BaseOptimizer):
             raise ValueError("model cannot be empty or whitespace-only")
 
         # Validate optional parameters
-        if project_name is not None and not isinstance(project_name, str):
-            raise ValueError(
-                f"project_name must be a string or None, got {type(project_name).__name__}"
-            )
-
-        if reflection_model is not None and not isinstance(reflection_model, str):
-            raise ValueError(
-                f"reflection_model must be a string or None, got {type(reflection_model).__name__}"
-            )
-
         if not isinstance(verbose, int):
             raise ValueError(
                 f"verbose must be an integer, got {type(verbose).__name__}"
@@ -65,32 +70,19 @@
         if not isinstance(seed, int):
             raise ValueError(f"seed must be an integer, got {type(seed).__name__}")
 
-        super().__init__(model=model, verbose=verbose, seed=seed, **model_kwargs)
-        self.project_name = project_name
-        self.reflection_model = reflection_model or model
-        self.num_threads = self.model_kwargs.pop("num_threads", 6)
+        super().__init__(
+            model=model, verbose=verbose, seed=seed, model_parameters=model_parameters
+        )
+        self.n_threads = n_threads
         self._gepa_live_metric_calls = 0
         self._adapter = None  # Will be set during optimization
 
     def get_optimizer_metadata(self) -> dict[str, Any]:
         return {
-            "project_name": self.project_name,
-            "reflection_model": self.reflection_model,
+            "model": self.model,
+            "n_threads": self.n_threads,
         }
 
-    def cleanup(self) -> None:
-        """
-        Clean up GEPA-specific resources.
-        """
-        # Call parent cleanup
-        super().cleanup()
-
-        # Clear GEPA-specific resources
-        self._adapter = None
-        self._gepa_live_metric_calls = 0
-
-        logger.debug("Cleaned up GEPA-specific resources")
-
     # ------------------------------------------------------------------
     # Helpers
     # ------------------------------------------------------------------
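Note: taken together, the three hunks above replace the old project_name/reflection_model/**model_kwargs constructor with an explicit signature. A minimal construction sketch (the import path and parameter values are assumptions, not taken from the diff):

    from opik_optimizer import GepaOptimizer  # import path assumed

    optimizer = GepaOptimizer(
        model="gpt-4o",                         # LiteLLM model name
        model_parameters={"temperature": 0.2},  # forwarded to LiteLLM calls
        n_threads=6,                            # was the num_threads model-kwarg
        verbose=1,
        seed=42,
    )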
@@ -161,7 +153,19 @@
         n_samples: int | None = None,
         auto_continue: bool = False,
         agent_class: type[OptimizableAgent] | None = None,
-        **kwargs: Any,
+        project_name: str = "Optimization",
+        max_trials: int = 10,
+        reflection_minibatch_size: int = 3,
+        candidate_selection_strategy: str = "pareto",
+        skip_perfect_score: bool = True,
+        perfect_score: float = 1.0,
+        use_merge: bool = False,
+        max_merge_invocations: int = 5,
+        run_dir: str | None = None,
+        track_best_outputs: bool = False,
+        display_progress_bar: bool = False,
+        seed: int = 42,
+        raise_on_exception: bool = True,
     ) -> OptimizationResult:
         """
         Optimize a prompt using GEPA (Genetic-Pareto) algorithm.
@@ -171,54 +175,33 @@
             dataset: Opik Dataset to optimize on
             metric: Metric function to evaluate on
             experiment_config: Optional configuration for the experiment
+            max_trials: Maximum number of different prompts to test (default: 10)
             n_samples: Optional number of items to test in the dataset
             auto_continue: Whether to auto-continue optimization
             agent_class: Optional agent class to use
-            **kwargs: GEPA-specific parameters:
-                max_metric_calls (int | None): Maximum number of metric evaluations (default: 30)
-                reflection_minibatch_size (int): Size of reflection minibatches (default: 3)
-                candidate_selection_strategy (str): Strategy for candidate selection (default: "pareto")
-                skip_perfect_score (bool): Skip candidates with perfect scores (default: True)
-                perfect_score (float): Score considered perfect (default: 1.0)
-                use_merge (bool): Enable merge operations (default: False)
-                max_merge_invocations (int): Maximum merge invocations (default: 5)
-                run_dir (str | None): Directory for run outputs (default: None)
-                track_best_outputs (bool): Track best outputs during optimization (default: False)
-                display_progress_bar (bool): Display progress bar (default: False)
-                seed (int): Random seed for reproducibility (default: 42)
-                raise_on_exception (bool): Raise exceptions instead of continuing (default: True)
-                mcp_config (MCPExecutionConfig | None): MCP tool calling configuration (default: None)
+            reflection_minibatch_size: Size of reflection minibatches (default: 3)
+            candidate_selection_strategy: Strategy for candidate selection (default: "pareto")
+            skip_perfect_score: Skip candidates with perfect scores (default: True)
+            perfect_score: Score considered perfect (default: 1.0)
+            use_merge: Enable merge operations (default: False)
+            max_merge_invocations: Maximum merge invocations (default: 5)
+            run_dir: Directory for run outputs (default: None)
+            track_best_outputs: Track best outputs during optimization (default: False)
+            display_progress_bar: Display progress bar (default: False)
+            seed: Random seed for reproducibility (default: 42)
+            raise_on_exception: Raise exceptions instead of continuing (default: True)
 
         Returns:
             OptimizationResult: Result of the optimization
         """
         # Use base class validation and setup methods
-        self.validate_optimization_inputs(prompt, dataset, metric)
-
-        # Extract GEPA-specific parameters from kwargs
-        max_metric_calls: int | None = kwargs.get("max_metric_calls", 30)
-        reflection_minibatch_size: int = int(kwargs.get("reflection_minibatch_size", 3))
-        candidate_selection_strategy: str = str(
-            kwargs.get("candidate_selection_strategy", "pareto")
-        )
-        skip_perfect_score: bool = kwargs.get("skip_perfect_score", True)
-        perfect_score: float = float(kwargs.get("perfect_score", 1.0))
-        use_merge: bool = kwargs.get("use_merge", False)
-        max_merge_invocations: int = int(kwargs.get("max_merge_invocations", 5))
-        run_dir: str | None = kwargs.get("run_dir", None)
-        track_best_outputs: bool = kwargs.get("track_best_outputs", False)
-        display_progress_bar: bool = kwargs.get("display_progress_bar", False)
-        seed: int = int(kwargs.get("seed", 42))
-        raise_on_exception: bool = kwargs.get("raise_on_exception", True)
-        kwargs.pop("mcp_config", None)  # Added for MCP support (for future use)
+        self._validate_optimization_inputs(prompt, dataset, metric)
 
         prompt = prompt.copy()
-        if self.project_name:
-            prompt.project_name = self.project_name
         if prompt.model is None:
             prompt.model = self.model
         if not prompt.model_kwargs:
-            prompt.model_kwargs = dict(self.model_kwargs)
+            prompt.model_kwargs = dict(self.model_parameters)
 
         seed_prompt_text = self._extract_system_text(prompt)
         input_key, output_key = self._infer_dataset_keys(dataset)
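Note: with **kwargs gone, GEPA options are now explicit keyword arguments, and max_metric_calls is no longer accepted; it is derived from max_trials and the effective sample count (see the next hunk). A sketch of a call to the surrounding optimize_prompt method, assuming dataset, prompt, and metric_fn already exist:

    result = optimizer.optimize_prompt(
        prompt=prompt,               # a ChatPrompt; illustrative
        dataset=dataset,
        metric=metric_fn,            # illustrative metric function
        n_samples=50,
        max_trials=10,               # replaces the old max_metric_calls kwarg
        project_name="Optimization",
    )
    # Resulting budget (next hunk): max_metric_calls = max_trials * samples = 10 * 50 = 500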
@@ -227,12 +210,19 @@
         if n_samples and 0 < n_samples < len(items):
             items = items[:n_samples]
 
+        # Calculate max_metric_calls from max_trials and effective samples
+        effective_n_samples = len(items)
+        max_metric_calls = max_trials * effective_n_samples
+
         data_insts = self._build_data_insts(items, input_key, output_key)
 
         self._gepa_live_metric_calls = 0
 
         base_prompt = prompt.copy()
 
+        # Set project name from parameter
+        self.project_name = project_name
+
         opt_id: str | None = None
         ds_id: str | None = getattr(dataset, "id", None)
 
@@ -249,8 +239,10 @@
         ) as optimization:
             try:
                 opt_id = optimization.id if optimization is not None else None
+                self.current_optimization_id = opt_id
             except Exception:
                 opt_id = None
+                self.current_optimization_id = None
 
             gepa_reporting.display_header(
                 algorithm=self.__class__.__name__,
@@ -266,11 +258,11 @@
                 optimizer_config={
                     "optimizer": self.__class__.__name__,
                     "model": self.model,
-                    "reflection_model": self.reflection_model,
+                    "max_trials": max_trials,
+                    "n_samples": n_samples or "all",
                     "max_metric_calls": max_metric_calls,
                     "reflection_minibatch_size": reflection_minibatch_size,
                     "candidate_selection_strategy": candidate_selection_strategy,
-                    "n_samples": n_samples or "all",
                 },
                 verbose=self.verbose,
             )
@@ -280,15 +272,6 @@
             initial_score = 0.0
             with gepa_reporting.baseline_evaluation(verbose=self.verbose) as baseline:
                 try:
-                    baseline_suppress: ContextManager[Any] = nullcontext()
-                    try:
-                        from ..reporting_utils import (
-                            suppress_opik_logs as _suppress_logs,
-                        )
-
-                        baseline_suppress = _suppress_logs()
-                    except Exception:
-                        pass
                     eval_kwargs = dict(
                         prompt=prompt,
                         dataset=dataset,
@@ -298,7 +281,7 @@
                         extra_metadata={"phase": "baseline"},
                         verbose=0,
                     )
-                    with baseline_suppress:
+                    with suppress_opik_logs():
                         initial_score = float(
                             self._evaluate_prompt_logged(**eval_kwargs)
                         )
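Note: the nine-line defensive import removed above collapses into the module-level suppress_opik_logs import added in the @@ -17,6 +17,7 @@ hunk; it is used as an ordinary context manager. A minimal sketch of the pattern (run_evaluation is illustrative):

    from opik_optimizer.reporting_utils import suppress_opik_logs

    with suppress_opik_logs():
        run_evaluation()  # illustrative; Opik log output is muted inside the block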
@@ -307,12 +290,11 @@
                     logger.exception("Baseline evaluation failed")
 
             adapter_prompt = self._apply_system_text(base_prompt, seed_prompt_text)
-            adapter_prompt.project_name = self.project_name
             adapter_prompt.model = self.model
             # Filter out GEPA-specific parameters that shouldn't be passed to LLM
             filtered_model_kwargs = {
                 k: v
-                for k, v in self.model_kwargs.items()
+                for k, v in self.model_parameters.items()
                 if k not in ["num_prompts_per_round", "rounds"]
             }
             adapter_prompt.model_kwargs = filtered_model_kwargs
@@ -330,40 +312,52 @@
         except Exception as exc:  # pragma: no cover
             raise ImportError("gepa package is required for GepaOptimizer") from exc
 
-        kwargs_gepa: dict[str, Any] = {
-            "seed_candidate": {"system_prompt": seed_prompt_text},
-            "trainset": data_insts,
-            "valset": data_insts,
-            "adapter": adapter,
-            "task_lm": None,
-            "reflection_lm": self.reflection_model,
-            "candidate_selection_strategy": candidate_selection_strategy,
-            "skip_perfect_score": skip_perfect_score,
-            "reflection_minibatch_size": reflection_minibatch_size,
-            "perfect_score": perfect_score,
-            "use_merge": use_merge,
-            "max_merge_invocations": max_merge_invocations,
-            "max_metric_calls": max_metric_calls,
-            "run_dir": run_dir,
-            "track_best_outputs": track_best_outputs,
-            "display_progress_bar": display_progress_bar,
-            "seed": seed,
-            "raise_on_exception": raise_on_exception,
-            "logger": gepa_reporting.RichGEPAOptimizerLogger(
-                self, verbose=self.verbose
-            ),
-        }
+        # When using our Rich logger, disable GEPA's native progress bar to avoid conflicts
+        use_gepa_progress_bar = display_progress_bar if self.verbose == 0 else False
+
+        with gepa_reporting.start_gepa_optimization(
+            verbose=self.verbose, max_trials=max_trials
+        ) as reporter:
+            # Create logger with progress bar support
+            logger_instance = gepa_reporting.RichGEPAOptimizerLogger(
+                self,
+                verbose=self.verbose,
+                progress=reporter.progress,
+                task_id=reporter.task_id,
+                max_trials=max_trials,
+            )
+
+            kwargs_gepa: dict[str, Any] = {
+                "seed_candidate": {"system_prompt": seed_prompt_text},
+                "trainset": data_insts,
+                "valset": data_insts,
+                "adapter": adapter,
+                "task_lm": None,
+                "reflection_lm": self.model,
+                "candidate_selection_strategy": candidate_selection_strategy,
+                "skip_perfect_score": skip_perfect_score,
+                "reflection_minibatch_size": reflection_minibatch_size,
+                "perfect_score": perfect_score,
+                "use_merge": use_merge,
+                "max_merge_invocations": max_merge_invocations,
+                "max_metric_calls": max_metric_calls,
+                "run_dir": run_dir,
+                "track_best_outputs": track_best_outputs,
+                "display_progress_bar": use_gepa_progress_bar,
+                "seed": seed,
+                "raise_on_exception": raise_on_exception,
+                "logger": logger_instance,
+            }
 
-        optimize_sig = None
-        try:
-            optimize_sig = inspect.signature(gepa.optimize)
-        except Exception:
             optimize_sig = None
+            try:
+                optimize_sig = inspect.signature(gepa.optimize)
+            except Exception:
+                optimize_sig = None
 
-        if optimize_sig and "stop_callbacks" not in optimize_sig.parameters:
-            kwargs_gepa["max_metric_calls"] = max_metric_calls
+            if optimize_sig and "stop_callbacks" not in optimize_sig.parameters:
+                kwargs_gepa["max_metric_calls"] = max_metric_calls
 
-        with gepa_reporting.start_gepa_optimization(verbose=self.verbose):
             gepa_result = gepa.optimize(**kwargs_gepa)
 
         try:
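Note: the stop_callbacks probe above is signature-based feature detection: inspect the installed gepa.optimize and pass max_metric_calls only when the older API expects it. A generic sketch of the same technique (the helper name is illustrative, not from the diff):

    import inspect

    def kwargs_supported_by(func, candidate_kwargs):
        """Keep only the kwargs that func actually accepts."""
        try:
            params = inspect.signature(func).parameters
        except (TypeError, ValueError):  # some callables cannot be introspected
            return dict(candidate_kwargs)
        return {k: v for k, v in candidate_kwargs.items() if k in params}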
@@ -385,60 +379,71 @@
         candidate_rows: list[dict[str, Any]] = []
         history: list[dict[str, Any]] = []
 
-        for idx, candidate in enumerate(candidates):
-            candidate_prompt = self._extract_system_text_from_candidate(
-                candidate, seed_prompt_text
-            )
-            prompt_variant = self._apply_system_text(prompt, candidate_prompt)
-            prompt_variant.project_name = self.project_name
-            prompt_variant.model = self.model
-            # Filter out GEPA-specific parameters that shouldn't be passed to LLM
-            filtered_model_kwargs = {
-                k: v
-                for k, v in self.model_kwargs.items()
-                if k not in ["num_prompts_per_round", "rounds"]
-            }
-            prompt_variant.model_kwargs = filtered_model_kwargs
+        # Import convert_tqdm_to_rich for suppressing display functions
+        from ..reporting_utils import convert_tqdm_to_rich
 
-            eval_kwargs = dict(
-                prompt=prompt_variant,
-                dataset=dataset,
-                metric=metric,
-                n_samples=n_samples,
-                optimization_id=opt_id,
-                extra_metadata={"phase": "rescoring", "candidate_index": idx},
-                verbose=0,
-            )
-            try:
-                score = float(self._evaluate_prompt_logged(**eval_kwargs))
-            except Exception:
-                logger.debug("Rescoring failed for candidate %s", idx, exc_info=True)
-                score = 0.0
-
-            rescored.append(score)
-            candidate_rows.append(
-                {
-                    "iteration": idx + 1,
-                    "system_prompt": candidate_prompt,
-                    "gepa_score": val_scores[idx] if idx < len(val_scores) else None,
-                    "opik_score": score,
-                    "source": self.__class__.__name__,
-                }
-            )
-            history.append(
-                {
-                    "iteration": idx + 1,
-                    "prompt_candidate": candidate_prompt,
-                    "scores": [
-                        {
-                            "metric_name": f"GEPA-{metric.__name__}",
-                            "score": val_scores[idx] if idx < len(val_scores) else None,
-                        },
-                        {"metric_name": metric.__name__, "score": score},
-                    ],
-                    "metadata": {},
-                }
-            )
+        # Wrap rescoring to prevent OPIK messages and experiment link displays
+        with suppress_opik_logs():
+            with convert_tqdm_to_rich(verbose=0):
+                for idx, candidate in enumerate(candidates):
+                    candidate_prompt = self._extract_system_text_from_candidate(
+                        candidate, seed_prompt_text
+                    )
+                    prompt_variant = self._apply_system_text(prompt, candidate_prompt)
+                    prompt_variant.model = self.model
+                    # Filter out GEPA-specific parameters that shouldn't be passed to LLM
+                    filtered_model_kwargs = {
+                        k: v
+                        for k, v in self.model_parameters.items()
+                        if k not in ["num_prompts_per_round", "rounds"]
+                    }
+                    prompt_variant.model_kwargs = filtered_model_kwargs
+
+                    eval_kwargs = dict(
+                        prompt=prompt_variant,
+                        dataset=dataset,
+                        metric=metric,
+                        n_samples=n_samples,
+                        optimization_id=opt_id,
+                        extra_metadata={"phase": "rescoring", "candidate_index": idx},
+                        verbose=0,
+                    )
+                    try:
+                        score = float(self._evaluate_prompt_logged(**eval_kwargs))
+                    except Exception:
+                        logger.debug(
+                            "Rescoring failed for candidate %s", idx, exc_info=True
+                        )
+                        score = 0.0
+
+                    rescored.append(score)
+                    candidate_rows.append(
+                        {
+                            "iteration": idx + 1,
+                            "system_prompt": candidate_prompt,
+                            "gepa_score": val_scores[idx]
+                            if idx < len(val_scores)
+                            else None,
+                            "opik_score": score,
+                            "source": self.__class__.__name__,
+                        }
+                    )
+                    history.append(
+                        {
+                            "iteration": idx + 1,
+                            "prompt_candidate": candidate_prompt,
+                            "scores": [
+                                {
+                                    "metric_name": f"GEPA-{metric.__name__}",
+                                    "score": val_scores[idx]
+                                    if idx < len(val_scores)
+                                    else None,
+                                },
+                                {"metric_name": metric.__name__, "score": score},
+                            ],
+                            "metadata": {},
+                        }
+                    )
 
         if rescored:
             best_idx = max(range(len(rescored)), key=lambda i: rescored[i])
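Note: the rescoring loop above now runs under two stacked suppression contexts; the same nesting can be flattened with contextlib.ExitStack if more contexts accumulate. A sketch assuming the two context managers from the hunk (rescore_all_candidates stands in for the loop body):

    from contextlib import ExitStack

    with ExitStack() as stack:
        stack.enter_context(suppress_opik_logs())
        stack.enter_context(convert_tqdm_to_rich(verbose=0))
        rescore_all_candidates()  # illustrative stand-in for the loop above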
@@ -455,12 +460,11 @@
             )
 
         final_prompt = self._apply_system_text(prompt, best_prompt_text)
-        final_prompt.project_name = self.project_name
         final_prompt.model = self.model
         # Filter out GEPA-specific parameters that shouldn't be passed to LLM
         filtered_model_kwargs = {
             k: v
-            for k, v in self.model_kwargs.items()
+            for k, v in self.model_parameters.items()
             if k not in ["num_prompts_per_round", "rounds"]
         }
         final_prompt.model_kwargs = filtered_model_kwargs
@@ -516,7 +520,7 @@
 
         details: dict[str, Any] = {
             "model": self.model,
-            "temperature": self.model_kwargs.get("temperature"),
+            "temperature": self.model_parameters.get("temperature"),
             "optimizer": self.__class__.__name__,
             "num_candidates": getattr(gepa_result, "num_candidates", None),
             "total_metric_calls": getattr(gepa_result, "total_metric_calls", None),
@@ -618,7 +622,7 @@
         if prompt.model is None:
             prompt.model = self.model
         if prompt.model_kwargs is None:
-            prompt.model_kwargs = self.model_kwargs
+            prompt.model_kwargs = self.model_parameters
 
         agent_class = create_litellm_agent_class(prompt, optimizer_ref=self)
         self.agent_class = agent_class
@@ -627,6 +631,13 @@
         def llm_task(dataset_item: dict[str, Any]) -> dict[str, str]:
             messages = prompt.get_messages(dataset_item)
             raw = agent.invoke(messages)
+
+            # Add tags to trace for optimization tracking
+            if self.current_optimization_id:
+                opik_context.update_current_trace(
+                    tags=[self.current_optimization_id, "Evaluation"]
+                )
+
             return {mappers.EVALUATED_LLM_TASK_OUTPUT: raw.strip()}
 
         configuration_updates = self._drop_none({"gepa": extra_metadata})
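Note: the new opik_context.update_current_trace call tags each evaluation trace with the current optimization id, so traces are filterable per optimization run in Opik. A standalone sketch of the pattern (call_model and the tag value are illustrative):

    import opik
    from opik import opik_context

    @opik.track
    def llm_task(dataset_item: dict) -> dict:
        output = call_model(dataset_item)  # illustrative model call
        opik_context.update_current_trace(tags=["<optimization-id>", "Evaluation"])
        return {"output": output}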
@@ -643,7 +654,7 @@
             dataset_item_ids=dataset_item_ids,
             metric=metric,
             evaluated_task=llm_task,
-            num_threads=self.num_threads,
+            num_threads=self.n_threads,
             project_name=experiment_config.get("project_name"),
             experiment_config=experiment_config,
             optimization_id=optimization_id,