opik-optimizer 1.0.6__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. opik_optimizer/__init__.py +4 -0
  2. opik_optimizer/_throttle.py +2 -1
  3. opik_optimizer/base_optimizer.py +402 -28
  4. opik_optimizer/data/context7_eval.jsonl +3 -0
  5. opik_optimizer/datasets/context7_eval.py +90 -0
  6. opik_optimizer/datasets/tiny_test.py +33 -34
  7. opik_optimizer/datasets/truthful_qa.py +2 -2
  8. opik_optimizer/evolutionary_optimizer/crossover_ops.py +194 -0
  9. opik_optimizer/evolutionary_optimizer/evaluation_ops.py +136 -0
  10. opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +289 -966
  11. opik_optimizer/evolutionary_optimizer/helpers.py +10 -0
  12. opik_optimizer/evolutionary_optimizer/llm_support.py +136 -0
  13. opik_optimizer/evolutionary_optimizer/mcp.py +249 -0
  14. opik_optimizer/evolutionary_optimizer/mutation_ops.py +306 -0
  15. opik_optimizer/evolutionary_optimizer/population_ops.py +228 -0
  16. opik_optimizer/evolutionary_optimizer/prompts.py +352 -0
  17. opik_optimizer/evolutionary_optimizer/reporting.py +28 -4
  18. opik_optimizer/evolutionary_optimizer/style_ops.py +86 -0
  19. opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +90 -81
  20. opik_optimizer/few_shot_bayesian_optimizer/reporting.py +12 -5
  21. opik_optimizer/gepa_optimizer/__init__.py +3 -0
  22. opik_optimizer/gepa_optimizer/adapter.py +154 -0
  23. opik_optimizer/gepa_optimizer/gepa_optimizer.py +653 -0
  24. opik_optimizer/gepa_optimizer/reporting.py +181 -0
  25. opik_optimizer/logging_config.py +42 -7
  26. opik_optimizer/mcp_utils/__init__.py +22 -0
  27. opik_optimizer/mcp_utils/mcp.py +541 -0
  28. opik_optimizer/mcp_utils/mcp_second_pass.py +152 -0
  29. opik_optimizer/mcp_utils/mcp_simulator.py +116 -0
  30. opik_optimizer/mcp_utils/mcp_workflow.py +547 -0
  31. opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +470 -134
  32. opik_optimizer/meta_prompt_optimizer/reporting.py +16 -2
  33. opik_optimizer/mipro_optimizer/_lm.py +30 -23
  34. opik_optimizer/mipro_optimizer/_mipro_optimizer_v2.py +52 -51
  35. opik_optimizer/mipro_optimizer/mipro_optimizer.py +126 -46
  36. opik_optimizer/mipro_optimizer/utils.py +2 -4
  37. opik_optimizer/optimizable_agent.py +21 -16
  38. opik_optimizer/optimization_config/chat_prompt.py +44 -23
  39. opik_optimizer/optimization_config/configs.py +3 -3
  40. opik_optimizer/optimization_config/mappers.py +9 -8
  41. opik_optimizer/optimization_result.py +22 -14
  42. opik_optimizer/reporting_utils.py +61 -10
  43. opik_optimizer/task_evaluator.py +9 -8
  44. opik_optimizer/utils/__init__.py +15 -0
  45. opik_optimizer/utils/colbert.py +236 -0
  46. opik_optimizer/{utils.py → utils/core.py} +160 -33
  47. opik_optimizer/utils/dataset_utils.py +49 -0
  48. opik_optimizer/utils/prompt_segments.py +186 -0
  49. opik_optimizer-2.0.0.dist-info/METADATA +345 -0
  50. opik_optimizer-2.0.0.dist-info/RECORD +74 -0
  51. opik_optimizer-2.0.0.dist-info/licenses/LICENSE +203 -0
  52. opik_optimizer-1.0.6.dist-info/METADATA +0 -181
  53. opik_optimizer-1.0.6.dist-info/RECORD +0 -50
  54. opik_optimizer-1.0.6.dist-info/licenses/LICENSE +0 -21
  55. {opik_optimizer-1.0.6.dist-info → opik_optimizer-2.0.0.dist-info}/WHEEL +0 -0
  56. {opik_optimizer-1.0.6.dist-info → opik_optimizer-2.0.0.dist-info}/top_level.txt +0 -0
opik_optimizer/gepa_optimizer/gepa_optimizer.py (new file)
@@ -0,0 +1,653 @@
+ import logging
+ from contextlib import nullcontext
+ from typing import Any, ContextManager
+ from collections.abc import Callable
+
+ import opik
+ from opik import Dataset
+ from opik.evaluation.metrics.score_result import ScoreResult
+
+ from ..base_optimizer import BaseOptimizer
+ from ..optimization_config import chat_prompt, mappers
+ from ..optimization_result import OptimizationResult
+ from ..optimizable_agent import OptimizableAgent
+ from ..utils import (
+     optimization_context,
+     create_litellm_agent_class,
+     disable_experiment_reporting,
+     enable_experiment_reporting,
+ )
+ from .. import task_evaluator
+ from . import reporting as gepa_reporting
+ from .adapter import OpikDataInst, OpikGEPAAdapter
+
+ logger = logging.getLogger(__name__)
+
+
+ class GepaOptimizer(BaseOptimizer):
+     """Minimal integration against the upstream GEPA engine."""
+
+     def __init__(
+         self,
+         model: str,
+         project_name: str | None = None,
+         reflection_model: str | None = None,
+         verbose: int = 1,
+         seed: int = 42,
+         **model_kwargs: Any,
+     ) -> None:
+         # Validate required parameters
+         if model is None:
+             raise ValueError("model parameter is required and cannot be None")
+         if not isinstance(model, str):
+             raise ValueError(f"model must be a string, got {type(model).__name__}")
+         if not model.strip():
+             raise ValueError("model cannot be empty or whitespace-only")
+
+         # Validate optional parameters
+         if project_name is not None and not isinstance(project_name, str):
+             raise ValueError(
+                 f"project_name must be a string or None, got {type(project_name).__name__}"
+             )
+
+         if reflection_model is not None and not isinstance(reflection_model, str):
+             raise ValueError(
+                 f"reflection_model must be a string or None, got {type(reflection_model).__name__}"
+             )
+
+         if not isinstance(verbose, int):
+             raise ValueError(
+                 f"verbose must be an integer, got {type(verbose).__name__}"
+             )
+         if verbose < 0:
+             raise ValueError("verbose must be non-negative")
+
+         if not isinstance(seed, int):
+             raise ValueError(f"seed must be an integer, got {type(seed).__name__}")
+
+         super().__init__(model=model, verbose=verbose, seed=seed, **model_kwargs)
+         self.project_name = project_name
+         self.reflection_model = reflection_model or model
+         self.num_threads = self.model_kwargs.pop("num_threads", 6)
+         self._gepa_live_metric_calls = 0
+         self._adapter = None  # Will be set during optimization
+
+     def get_optimizer_metadata(self) -> dict[str, Any]:
+         return {
+             "project_name": self.project_name,
+             "reflection_model": self.reflection_model,
+         }
+
+     def cleanup(self) -> None:
+         """
+         Clean up GEPA-specific resources.
+         """
+         # Call parent cleanup
+         super().cleanup()
+
+         # Clear GEPA-specific resources
+         self._adapter = None
+         self._gepa_live_metric_calls = 0
+
+         logger.debug("Cleaned up GEPA-specific resources")
+
+     # ------------------------------------------------------------------
+     # Helpers
+     # ------------------------------------------------------------------
+
+     def _build_data_insts(
+         self,
+         dataset_items: list[dict[str, Any]],
+         input_key: str,
+         output_key: str,
+     ) -> list[OpikDataInst]:
+         data_insts: list[OpikDataInst] = []
+         for item in dataset_items:
+             additional_context: dict[str, str] = {}
+             metadata = item.get("metadata") or {}
+             if isinstance(metadata, dict):
+                 context_value = metadata.get("context")
+                 if isinstance(context_value, str):
+                     additional_context["context"] = context_value
+             if "context" in item and isinstance(item["context"], str):
+                 additional_context.setdefault("context", item["context"])
+
+             data_insts.append(
+                 OpikDataInst(
+                     input_text=str(item.get(input_key, "")),
+                     answer=str(item.get(output_key, "")),
+                     additional_context=additional_context,
+                     opik_item=item,
+                 )
+             )
+         return data_insts
+
+     def _apply_system_text(
+         self, prompt_obj: chat_prompt.ChatPrompt, system_text: str
+     ) -> chat_prompt.ChatPrompt:
+         updated = prompt_obj.copy()
+         if updated.messages is not None:
+             messages = updated.get_messages()
+             if messages and messages[0].get("role") == "system":
+                 messages[0]["content"] = system_text
+             else:
+                 messages.insert(0, {"role": "system", "content": system_text})
+             updated.set_messages(messages)
+         else:
+             updated.system = system_text
+         return updated
+
+     def _infer_dataset_keys(self, dataset: Dataset) -> tuple[str, str]:
+         items = dataset.get_items(1)
+         if not items:
+             return "text", "label"
+         sample = items[0]
+         output_candidates = ["label", "answer", "output", "expected_output"]
+         output_key = next((k for k in output_candidates if k in sample), "label")
+         excluded = {output_key, "id", "metadata"}
+         input_key = next((k for k in sample.keys() if k not in excluded), "text")
+         return input_key, output_key
+
+     # ------------------------------------------------------------------
+     # Base optimizer overrides
+     # ------------------------------------------------------------------
+
+     def optimize_prompt(
+         self,
+         prompt: chat_prompt.ChatPrompt,
+         dataset: Dataset,
+         metric: Callable,
+         experiment_config: dict | None = None,
+         n_samples: int | None = None,
+         auto_continue: bool = False,
+         agent_class: type[OptimizableAgent] | None = None,
+         **kwargs: Any,
+     ) -> OptimizationResult:
+         """
+         Optimize a prompt using GEPA (Genetic-Pareto) algorithm.
+
+         Args:
+             prompt: The prompt to optimize
+             dataset: Opik Dataset to optimize on
+             metric: Metric function to evaluate on
+             experiment_config: Optional configuration for the experiment
+             n_samples: Optional number of items to test in the dataset
+             auto_continue: Whether to auto-continue optimization
+             agent_class: Optional agent class to use
+             **kwargs: GEPA-specific parameters:
+                 max_metric_calls (int | None): Maximum number of metric evaluations (default: 30)
+                 reflection_minibatch_size (int): Size of reflection minibatches (default: 3)
+                 candidate_selection_strategy (str): Strategy for candidate selection (default: "pareto")
+                 skip_perfect_score (bool): Skip candidates with perfect scores (default: True)
+                 perfect_score (float): Score considered perfect (default: 1.0)
+                 use_merge (bool): Enable merge operations (default: False)
+                 max_merge_invocations (int): Maximum merge invocations (default: 5)
+                 run_dir (str | None): Directory for run outputs (default: None)
+                 track_best_outputs (bool): Track best outputs during optimization (default: False)
+                 display_progress_bar (bool): Display progress bar (default: False)
+                 seed (int): Random seed for reproducibility (default: 42)
+                 raise_on_exception (bool): Raise exceptions instead of continuing (default: True)
+                 mcp_config (MCPExecutionConfig | None): MCP tool calling configuration (default: None)
+
+         Returns:
+             OptimizationResult: Result of the optimization
+         """
+         # Use base class validation and setup methods
+         self.validate_optimization_inputs(prompt, dataset, metric)
+
+         # Extract GEPA-specific parameters from kwargs
+         max_metric_calls: int | None = kwargs.get("max_metric_calls", 30)
+         reflection_minibatch_size: int = int(kwargs.get("reflection_minibatch_size", 3))
+         candidate_selection_strategy: str = str(
+             kwargs.get("candidate_selection_strategy", "pareto")
+         )
+         skip_perfect_score: bool = kwargs.get("skip_perfect_score", True)
+         perfect_score: float = float(kwargs.get("perfect_score", 1.0))
+         use_merge: bool = kwargs.get("use_merge", False)
+         max_merge_invocations: int = int(kwargs.get("max_merge_invocations", 5))
+         run_dir: str | None = kwargs.get("run_dir", None)
+         track_best_outputs: bool = kwargs.get("track_best_outputs", False)
+         display_progress_bar: bool = kwargs.get("display_progress_bar", False)
+         seed: int = int(kwargs.get("seed", 42))
+         raise_on_exception: bool = kwargs.get("raise_on_exception", True)
+         kwargs.pop("mcp_config", None)  # Added for MCP support (for future use)
+
+         prompt = prompt.copy()
+         if self.project_name:
+             prompt.project_name = self.project_name
+         if prompt.model is None:
+             prompt.model = self.model
+         if not prompt.model_kwargs:
+             prompt.model_kwargs = dict(self.model_kwargs)
+
+         seed_prompt_text = self._extract_system_text(prompt)
+         input_key, output_key = self._infer_dataset_keys(dataset)
+
+         items = dataset.get_items()
+         if n_samples and 0 < n_samples < len(items):
+             items = items[:n_samples]
+
+         data_insts = self._build_data_insts(items, input_key, output_key)
+
+         self._gepa_live_metric_calls = 0
+
+         base_prompt = prompt.copy()
+
+         opt_id: str | None = None
+         ds_id: str | None = getattr(dataset, "id", None)
+
+         opik_client = opik.Opik(project_name=self.project_name)
+
+         disable_experiment_reporting()
+
+         try:
+             with optimization_context(
+                 client=opik_client,
+                 dataset_name=dataset.name,
+                 objective_name=metric.__name__,
+                 metadata={"optimizer": self.__class__.__name__},
+             ) as optimization:
+                 try:
+                     opt_id = optimization.id if optimization is not None else None
+                 except Exception:
+                     opt_id = None
+
+                 gepa_reporting.display_header(
+                     algorithm="GEPA",
+                     optimization_id=opt_id,
+                     dataset_id=getattr(dataset, "id", None),
+                     verbose=self.verbose,
+                 )
+
+                 from ..reporting_utils import display_configuration as _display_config
+
+                 _display_config(
+                     messages=prompt.get_messages(),
+                     optimizer_config={
+                         "optimizer": "GEPA",
+                         "model": self.model,
+                         "reflection_model": self.reflection_model,
+                         "max_metric_calls": max_metric_calls,
+                         "reflection_minibatch_size": reflection_minibatch_size,
+                         "candidate_selection_strategy": candidate_selection_strategy,
+                         "n_samples": n_samples or "all",
+                     },
+                     verbose=self.verbose,
+                 )
+
+                 # Baseline evaluation
+                 initial_prompt_messages = prompt.get_messages()
+                 initial_score = 0.0
+                 with gepa_reporting.baseline_evaluation(verbose=self.verbose) as baseline:
+                     try:
+                         baseline_suppress: ContextManager[Any] = nullcontext()
+                         try:
+                             from ..reporting_utils import (
+                                 suppress_opik_logs as _suppress_logs,
+                             )
+
+                             baseline_suppress = _suppress_logs()
+                         except Exception:
+                             pass
+                         eval_kwargs = dict(
+                             prompt=prompt,
+                             dataset=dataset,
+                             metric=metric,
+                             n_samples=n_samples,
+                             optimization_id=opt_id,
+                             extra_metadata={"phase": "baseline"},
+                             verbose=0,
+                         )
+                         with baseline_suppress:
+                             initial_score = float(
+                                 self._evaluate_prompt_logged(**eval_kwargs)
+                             )
+                         baseline.set_score(initial_score)
+                     except Exception:
+                         logger.exception("Baseline evaluation failed")
+
+                 adapter_prompt = self._apply_system_text(base_prompt, seed_prompt_text)
+                 adapter_prompt.project_name = self.project_name
+                 adapter_prompt.model = self.model
+                 # Filter out GEPA-specific parameters that shouldn't be passed to LLM
+                 filtered_model_kwargs = {
+                     k: v
+                     for k, v in self.model_kwargs.items()
+                     if k not in ["num_prompts_per_round", "rounds"]
+                 }
+                 adapter_prompt.model_kwargs = filtered_model_kwargs
+
+                 adapter = OpikGEPAAdapter(
+                     base_prompt=adapter_prompt,
+                     optimizer=self,
+                     metric=metric,
+                     system_fallback=seed_prompt_text,
+                 )
+
+                 try:
+                     import gepa
+                     import inspect
+                 except Exception as exc:  # pragma: no cover
+                     raise ImportError("gepa package is required for GepaOptimizer") from exc
+
+                 kwargs_gepa: dict[str, Any] = {
+                     "seed_candidate": {"system_prompt": seed_prompt_text},
+                     "trainset": data_insts,
+                     "valset": data_insts,
+                     "adapter": adapter,
+                     "task_lm": None,
+                     "reflection_lm": self.reflection_model,
+                     "candidate_selection_strategy": candidate_selection_strategy,
+                     "skip_perfect_score": skip_perfect_score,
+                     "reflection_minibatch_size": reflection_minibatch_size,
+                     "perfect_score": perfect_score,
+                     "use_merge": use_merge,
+                     "max_merge_invocations": max_merge_invocations,
+                     "max_metric_calls": max_metric_calls,
+                     "run_dir": run_dir,
+                     "track_best_outputs": track_best_outputs,
+                     "display_progress_bar": display_progress_bar,
+                     "seed": seed,
+                     "raise_on_exception": raise_on_exception,
+                     "logger": gepa_reporting.RichGEPAOptimizerLogger(
+                         self, verbose=self.verbose
+                     ),
+                 }
+
+                 optimize_sig = None
+                 try:
+                     optimize_sig = inspect.signature(gepa.optimize)
+                 except Exception:
+                     optimize_sig = None
+
+                 if optimize_sig and "stop_callbacks" not in optimize_sig.parameters:
+                     kwargs_gepa["max_metric_calls"] = max_metric_calls
+
+                 with gepa_reporting.start_gepa_optimization(verbose=self.verbose):
+                     gepa_result = gepa.optimize(**kwargs_gepa)
+
+                 try:
+                     opt_id = optimization.id if optimization is not None else None
+                 except Exception:
+                     opt_id = None
+
+         finally:
+             enable_experiment_reporting()
+
+         # ------------------------------------------------------------------
+         # Rescoring & result assembly
+         # ------------------------------------------------------------------
+
+         candidates: list[dict[str, str]] = getattr(gepa_result, "candidates", []) or []
+         val_scores: list[float] = list(getattr(gepa_result, "val_aggregate_scores", []))
+
+         rescored: list[float] = []
+         candidate_rows: list[dict[str, Any]] = []
+         history: list[dict[str, Any]] = []
+
+         for idx, candidate in enumerate(candidates):
+             candidate_prompt = self._extract_system_text_from_candidate(
+                 candidate, seed_prompt_text
+             )
+             prompt_variant = self._apply_system_text(prompt, candidate_prompt)
+             prompt_variant.project_name = self.project_name
+             prompt_variant.model = self.model
+             # Filter out GEPA-specific parameters that shouldn't be passed to LLM
+             filtered_model_kwargs = {
+                 k: v
+                 for k, v in self.model_kwargs.items()
+                 if k not in ["num_prompts_per_round", "rounds"]
+             }
+             prompt_variant.model_kwargs = filtered_model_kwargs
+
+             eval_kwargs = dict(
+                 prompt=prompt_variant,
+                 dataset=dataset,
+                 metric=metric,
+                 n_samples=n_samples,
+                 optimization_id=opt_id,
+                 extra_metadata={"phase": "rescoring", "candidate_index": idx},
+                 verbose=0,
+             )
+             try:
+                 score = float(self._evaluate_prompt_logged(**eval_kwargs))
+             except Exception:
+                 logger.debug("Rescoring failed for candidate %s", idx, exc_info=True)
+                 score = 0.0
+
+             rescored.append(score)
+             candidate_rows.append(
+                 {
+                     "iteration": idx + 1,
+                     "system_prompt": candidate_prompt,
+                     "gepa_score": val_scores[idx] if idx < len(val_scores) else None,
+                     "opik_score": score,
+                     "source": "GEPA",
+                 }
+             )
+             history.append(
+                 {
+                     "iteration": idx + 1,
+                     "prompt_candidate": candidate_prompt,
+                     "scores": [
+                         {
+                             "metric_name": f"GEPA-{metric.__name__}",
+                             "score": val_scores[idx] if idx < len(val_scores) else None,
+                         },
+                         {"metric_name": metric.__name__, "score": score},
+                     ],
+                     "metadata": {},
+                 }
+             )
+
+         if rescored:
+             best_idx = max(range(len(rescored)), key=lambda i: rescored[i])
+             best_score = rescored[best_idx]
+         else:
+             best_idx = getattr(gepa_result, "best_idx", 0) or 0
+             best_score = float(val_scores[best_idx]) if val_scores else 0.0
+
+         best_candidate = (
+             candidates[best_idx] if candidates else {"system_prompt": seed_prompt_text}
+         )
+         best_prompt_text = self._extract_system_text_from_candidate(
+             best_candidate, seed_prompt_text
+         )
+
+         final_prompt = self._apply_system_text(prompt, best_prompt_text)
+         final_prompt.project_name = self.project_name
+         final_prompt.model = self.model
+         # Filter out GEPA-specific parameters that shouldn't be passed to LLM
+         filtered_model_kwargs = {
+             k: v
+             for k, v in self.model_kwargs.items()
+             if k not in ["num_prompts_per_round", "rounds"]
+         }
+         final_prompt.model_kwargs = filtered_model_kwargs
+
+         final_eval_kwargs = dict(
+             prompt=final_prompt,
+             dataset=dataset,
+             metric=metric,
+             n_samples=n_samples,
+             optimization_id=opt_id,
+             extra_metadata={"phase": "final", "selected": True},
+             verbose=0,
+         )
+         suppress_logs: ContextManager[Any] = nullcontext()
+         try:
+             from ..reporting_utils import suppress_opik_logs as _suppress_logs
+
+             suppress_logs = _suppress_logs()
+         except Exception:
+             pass
+
+         with suppress_logs:
+             try:
+                 self._evaluate_prompt_logged(**final_eval_kwargs)
+             except Exception:
+                 logger.debug("Final evaluation failed", exc_info=True)
+
+         per_item_scores: list[dict[str, Any]] = []
+         try:
+             analysis_prompt = final_prompt.copy()
+             agent_cls = create_litellm_agent_class(analysis_prompt, optimizer_ref=self)
+             agent = agent_cls(analysis_prompt)
+             for item in items:
+                 messages = analysis_prompt.get_messages(item)
+                 output_text = agent.invoke(messages).strip()
+                 metric_result = metric(item, output_text)
+                 if hasattr(metric_result, "value"):
+                     score_val = float(metric_result.value)
+                 elif hasattr(metric_result, "score"):
+                     score_val = float(metric_result.score)
+                 else:
+                     score_val = float(metric_result)
+                 per_item_scores.append(
+                     {
+                         "dataset_item_id": item.get("id"),
+                         "score": score_val,
+                         "answer": item.get(output_key),
+                         "output": output_text,
+                     }
+                 )
+         except Exception:
+             logger.debug("Per-item diagnostics failed", exc_info=True)
+
+         details: dict[str, Any] = {
+             "model": self.model,
+             "temperature": self.model_kwargs.get("temperature"),
+             "optimizer": self.__class__.__name__,
+             "num_candidates": getattr(gepa_result, "num_candidates", None),
+             "total_metric_calls": getattr(gepa_result, "total_metric_calls", None),
+             "parents": getattr(gepa_result, "parents", None),
+             "val_scores": val_scores,
+             "opik_rescored_scores": rescored,
+             "candidate_summary": candidate_rows,
+             "best_candidate_iteration": (
+                 candidate_rows[best_idx]["iteration"] if candidate_rows else 0
+             ),
+             "selected_candidate_index": best_idx,
+             "selected_candidate_gepa_score": (
+                 val_scores[best_idx] if best_idx < len(val_scores) else None
+             ),
+             "selected_candidate_opik_score": best_score,
+             "gepa_live_metric_used": True,
+             "gepa_live_metric_call_count": self._gepa_live_metric_calls,
+             "selected_candidate_item_scores": per_item_scores,
+             "dataset_item_ids": [item.get("id") for item in items],
+         }
+         if experiment_config:
+             details["experiment"] = experiment_config
+
+         final_messages = final_prompt.get_messages()
+
+         if self.verbose >= 1:
+             gepa_reporting.display_candidate_scores(
+                 candidate_rows, verbose=self.verbose
+             )
+             gepa_reporting.display_selected_candidate(
+                 best_prompt_text, best_score, verbose=self.verbose
+             )
+
+         if logger.isEnabledFor(logging.DEBUG):
+             for idx, row in enumerate(candidate_rows):
+                 logger.debug(
+                     "candidate=%s source=%s gepa=%s opik=%s",
+                     idx,
+                     row.get("source"),
+                     row.get("gepa_score"),
+                     row.get("opik_score"),
+                 )
+             logger.debug(
+                 "selected candidate idx=%s gepa=%s opik=%.4f",
+                 best_idx,
+                 details.get("selected_candidate_gepa_score"),
+                 best_score,
+             )
+
+         return OptimizationResult(
+             optimizer=self.__class__.__name__,
+             prompt=final_messages,
+             score=best_score,
+             metric_name=metric.__name__,
+             optimization_id=opt_id,
+             dataset_id=ds_id,
+             initial_prompt=initial_prompt_messages,
+             initial_score=initial_score,
+             details=details,
+             history=history,
+             llm_calls=None,
+         )
+
+     # ------------------------------------------------------------------
+     # Helpers used by BaseOptimizer.evaluate_prompt
+     # ------------------------------------------------------------------
+
+     def _extract_system_text(self, prompt: chat_prompt.ChatPrompt) -> str:
+         messages = prompt.get_messages()
+         for message in messages:
+             if message.get("role") == "system":
+                 return str(message.get("content", "")).strip()
+         for message in messages:
+             if message.get("role") == "user":
+                 return f"You are a helpful assistant. Respond to: {message.get('content', '')}"
+         return "You are a helpful assistant."
+
+     def _extract_system_text_from_candidate(
+         self, candidate: dict[str, Any], fallback: str
+     ) -> str:
+         for key in ("system_prompt", "system", "prompt"):
+             value = candidate.get(key)
+             if isinstance(value, str) and value.strip():
+                 return value
+         return fallback
+
+     def _evaluate_prompt_logged(
+         self,
+         prompt: chat_prompt.ChatPrompt,
+         dataset: Dataset,
+         metric: Callable[[dict[str, Any], str], ScoreResult],
+         n_samples: int | None = None,
+         dataset_item_ids: list[str] | None = None,
+         experiment_config: dict[str, Any] | None = None,
+         optimization_id: str | None = None,
+         extra_metadata: dict[str, Any] | None = None,
+         verbose: int = 1,
+     ) -> float:
+         if prompt.model is None:
+             prompt.model = self.model
+         if prompt.model_kwargs is None:
+             prompt.model_kwargs = self.model_kwargs
+
+         agent_class = create_litellm_agent_class(prompt, optimizer_ref=self)
+         self.agent_class = agent_class
+         agent = agent_class(prompt)
+
+         def llm_task(dataset_item: dict[str, Any]) -> dict[str, str]:
+             messages = prompt.get_messages(dataset_item)
+             raw = agent.invoke(messages)
+             return {mappers.EVALUATED_LLM_TASK_OUTPUT: raw.strip()}
+
+         configuration_updates = self._drop_none({"gepa": extra_metadata})
+         experiment_config = self._prepare_experiment_config(
+             prompt=prompt,
+             dataset=dataset,
+             metric=metric,
+             experiment_config=experiment_config,
+             configuration_updates=configuration_updates,
+         )
+
+         score = task_evaluator.evaluate(
+             dataset=dataset,
+             dataset_item_ids=dataset_item_ids,
+             metric=metric,
+             evaluated_task=llm_task,
+             num_threads=self.num_threads,
+             project_name=experiment_config.get("project_name"),
+             experiment_config=experiment_config,
+             optimization_id=optimization_id,
+             n_samples=n_samples,
+             verbose=verbose,
+         )
+         return score
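
For orientation, below is a minimal usage sketch of the new GepaOptimizer, based only on the signatures visible in this diff (the __init__ validation and the optimize_prompt docstring). It is illustrative, not authoritative: the dataset name, prompt text, and exact_match metric are placeholders, and the top-level ChatPrompt/GepaOptimizer imports and the get_or_create_dataset call are assumptions about the surrounding opik / opik_optimizer APIs rather than something this hunk shows.

import opik
from opik.evaluation.metrics.score_result import ScoreResult
from opik_optimizer import ChatPrompt, GepaOptimizer  # assumed top-level exports

def exact_match(dataset_item: dict, llm_output: str) -> ScoreResult:
    # Metric signature inferred from _evaluate_prompt_logged above:
    # it receives a dataset item plus the model output and returns a ScoreResult.
    expected = str(dataset_item.get("label", ""))
    return ScoreResult(name="exact_match", value=float(llm_output.strip() == expected))

# Placeholder dataset; any Opik Dataset whose items carry "text"/"label"
# fields works with the fallback keys used by _infer_dataset_keys.
dataset = opik.Opik().get_or_create_dataset("gepa-demo-dataset")

prompt = ChatPrompt(
    system="You are a concise classifier.",
    user="{text}",
)

optimizer = GepaOptimizer(
    model="openai/gpt-4o-mini",        # any LiteLLM-style model id
    reflection_model="openai/gpt-4o",  # defaults to `model` when omitted
    project_name="gepa-demo",
)

result = optimizer.optimize_prompt(
    prompt=prompt,
    dataset=dataset,
    metric=exact_match,
    n_samples=20,
    # GEPA-specific kwargs documented in the optimize_prompt docstring:
    max_metric_calls=30,
    reflection_minibatch_size=3,
    candidate_selection_strategy="pareto",
)

print(result.score)   # best Opik-rescored candidate score
print(result.prompt)  # optimized chat messages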