opik-optimizer 2.1.3__py3-none-any.whl → 2.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opik_optimizer/__init__.py +0 -2
- opik_optimizer/base_optimizer.py +313 -144
- opik_optimizer/evolutionary_optimizer/crossover_ops.py +31 -4
- opik_optimizer/evolutionary_optimizer/evaluation_ops.py +23 -3
- opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +122 -95
- opik_optimizer/evolutionary_optimizer/mcp.py +11 -6
- opik_optimizer/evolutionary_optimizer/mutation_ops.py +25 -5
- opik_optimizer/evolutionary_optimizer/population_ops.py +26 -10
- opik_optimizer/evolutionary_optimizer/reporting.py +5 -5
- opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +53 -99
- opik_optimizer/few_shot_bayesian_optimizer/reporting.py +4 -4
- opik_optimizer/gepa_optimizer/gepa_optimizer.py +345 -201
- opik_optimizer/gepa_optimizer/reporting.py +291 -22
- opik_optimizer/hierarchical_reflective_optimizer/hierarchical_reflective_optimizer.py +90 -167
- opik_optimizer/hierarchical_reflective_optimizer/prompts.py +7 -1
- opik_optimizer/hierarchical_reflective_optimizer/reporting.py +168 -75
- opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +185 -205
- opik_optimizer/meta_prompt_optimizer/reporting.py +4 -4
- opik_optimizer/mipro_optimizer/__init__.py +2 -2
- opik_optimizer/mipro_optimizer/_lm.py +4 -4
- opik_optimizer/mipro_optimizer/{_mipro_optimizer_v2.py → mipro_optimizer_v2.py} +1 -7
- opik_optimizer/mipro_optimizer/utils.py +1 -0
- opik_optimizer/optimizable_agent.py +7 -4
- opik_optimizer/optimization_config/chat_prompt.py +7 -10
- opik_optimizer/parameter_optimizer/parameter_optimizer.py +188 -40
- opik_optimizer/parameter_optimizer/reporting.py +148 -0
- opik_optimizer/reporting_utils.py +60 -15
- opik_optimizer/utils/__init__.py +3 -0
- opik_optimizer/utils/candidate_utils.py +52 -0
- opik_optimizer/utils/core.py +35 -2
- opik_optimizer/utils/prompt_segments.py +1 -2
- {opik_optimizer-2.1.3.dist-info → opik_optimizer-2.2.1.dist-info}/METADATA +2 -3
- {opik_optimizer-2.1.3.dist-info → opik_optimizer-2.2.1.dist-info}/RECORD +36 -36
- opik_optimizer/evolutionary_optimizer/llm_support.py +0 -136
- opik_optimizer/mipro_optimizer/mipro_optimizer.py +0 -680
- {opik_optimizer-2.1.3.dist-info → opik_optimizer-2.2.1.dist-info}/WHEEL +0 -0
- {opik_optimizer-2.1.3.dist-info → opik_optimizer-2.2.1.dist-info}/licenses/LICENSE +0 -0
- {opik_optimizer-2.1.3.dist-info → opik_optimizer-2.2.1.dist-info}/top_level.txt +0 -0
opik_optimizer/gepa_optimizer/gepa_optimizer.py:

@@ -1,10 +1,10 @@
 import logging
-from contextlib import nullcontext
-from typing import Any, ContextManager
+from typing import Any
 from collections.abc import Callable
 
 import opik
-from opik import Dataset
+from opik import Dataset, opik_context
+from opik.evaluation import evaluator as opik_evaluator
 from opik.evaluation.metrics.score_result import ScoreResult
 
 from ..base_optimizer import BaseOptimizer
@@ -16,7 +16,10 @@ from ..utils import (
     create_litellm_agent_class,
     disable_experiment_reporting,
     enable_experiment_reporting,
+    unique_ordered_by_key,
 )
+from ..task_evaluator import _create_metric_class
+from ..reporting_utils import suppress_opik_logs
 from .. import task_evaluator
 from . import reporting as gepa_reporting
 from .adapter import OpikDataInst, OpikGEPAAdapter
@@ -25,16 +28,30 @@ logger = logging.getLogger(__name__)
 
 
 class GepaOptimizer(BaseOptimizer):
-    """…
+    """
+    The GEPA (Genetic-Pareto) Optimizer uses a genetic algorithm with Pareto optimization
+    to improve prompts while balancing multiple objectives.
+
+    This algorithm is well-suited for complex optimization tasks where you want to find
+    prompts that balance trade-offs between different quality metrics.
+
+    Args:
+        model: LiteLLM model name for the optimization algorithm
+        model_parameters: Optional dict of LiteLLM parameters for optimizer's internal LLM calls.
+            Common params: temperature, max_tokens, max_completion_tokens, top_p.
+            See: https://docs.litellm.ai/docs/completion/input
+        n_threads: Number of parallel threads for evaluation
+        verbose: Controls internal logging/progress bars (0=off, 1=on)
+        seed: Random seed for reproducibility
+    """
 
     def __init__(
         self,
-        model: str,
-        project_name: str | None = None,
-        reflection_model: str | None = None,
+        model: str = "gpt-4o",
+        model_parameters: dict[str, Any] | None = None,
+        n_threads: int = 6,
         verbose: int = 1,
         seed: int = 42,
-        **model_kwargs: Any,
     ) -> None:
         # Validate required parameters
         if model is None:
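
The constructor rewrite above is a breaking change: 2.1.3 accepted `project_name`, `reflection_model`, and arbitrary `**model_kwargs`, while 2.2.1 groups LLM settings into an explicit `model_parameters` dict and makes `n_threads` a first-class argument (the project name now travels with `optimize_prompt`, see below). A minimal instantiation sketch for 2.2.1, assuming `GepaOptimizer` is still re-exported from the package root as in 2.1.x:

```python
from opik_optimizer import GepaOptimizer  # import path assumed from 2.1.x releases

# 2.1.3 style (no longer accepted):
#   GepaOptimizer(model="openai/gpt-4o", project_name="my-project", temperature=0.2)

# 2.2.1 style: LiteLLM parameters are grouped into model_parameters.
optimizer = GepaOptimizer(
    model="openai/gpt-4o",
    model_parameters={"temperature": 0.2, "max_tokens": 1024},
    n_threads=6,  # parallel evaluation threads
    verbose=1,
    seed=42,
)
```
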
@@ -45,16 +62,6 @@ class GepaOptimizer(BaseOptimizer):
             raise ValueError("model cannot be empty or whitespace-only")
 
         # Validate optional parameters
-        if project_name is not None and not isinstance(project_name, str):
-            raise ValueError(
-                f"project_name must be a string or None, got {type(project_name).__name__}"
-            )
-
-        if reflection_model is not None and not isinstance(reflection_model, str):
-            raise ValueError(
-                f"reflection_model must be a string or None, got {type(reflection_model).__name__}"
-            )
-
         if not isinstance(verbose, int):
             raise ValueError(
                 f"verbose must be an integer, got {type(verbose).__name__}"
@@ -65,32 +72,19 @@ class GepaOptimizer(BaseOptimizer):
         if not isinstance(seed, int):
             raise ValueError(f"seed must be an integer, got {type(seed).__name__}")
 
-        super().__init__(
-            …
-            …
-        self.…
+        super().__init__(
+            model=model, verbose=verbose, seed=seed, model_parameters=model_parameters
+        )
+        self.n_threads = n_threads
         self._gepa_live_metric_calls = 0
         self._adapter = None  # Will be set during optimization
 
     def get_optimizer_metadata(self) -> dict[str, Any]:
         return {
-            "…
-            "…
+            "model": self.model,
+            "n_threads": self.n_threads,
         }
 
-    def cleanup(self) -> None:
-        """
-        Clean up GEPA-specific resources.
-        """
-        # Call parent cleanup
-        super().cleanup()
-
-        # Clear GEPA-specific resources
-        self._adapter = None
-        self._gepa_live_metric_calls = 0
-
-        logger.debug("Cleaned up GEPA-specific resources")
-
     # ------------------------------------------------------------------
     # Helpers
     # ------------------------------------------------------------------
@@ -161,7 +155,19 @@
         n_samples: int | None = None,
         auto_continue: bool = False,
         agent_class: type[OptimizableAgent] | None = None,
-        **kwargs: Any,
+        project_name: str = "Optimization",
+        max_trials: int = 10,
+        reflection_minibatch_size: int = 3,
+        candidate_selection_strategy: str = "pareto",
+        skip_perfect_score: bool = True,
+        perfect_score: float = 1.0,
+        use_merge: bool = False,
+        max_merge_invocations: int = 5,
+        run_dir: str | None = None,
+        track_best_outputs: bool = False,
+        display_progress_bar: bool = False,
+        seed: int = 42,
+        raise_on_exception: bool = True,
     ) -> OptimizationResult:
         """
         Optimize a prompt using GEPA (Genetic-Pareto) algorithm.
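
With `**kwargs` gone from the signature, every tuning knob above is now an explicit keyword: misspelled options fail fast with a `TypeError` instead of being silently ignored, and `max_metric_calls`/`mcp_config` are removed (the budget is now derived from `max_trials`, see the later hunks). A hedged migration sketch; `optimizer`, `prompt`, `dataset`, and `metric` are assumed defined as described in the Args section:

```python
# 2.1.3 (options smuggled through **kwargs):
#   result = optimizer.optimize_prompt(
#       prompt=prompt, dataset=dataset, metric=metric,
#       max_metric_calls=300, reflection_minibatch_size=3,
#   )

# 2.2.1 (explicit keywords; budget = max_trials * number of evaluated items):
result = optimizer.optimize_prompt(
    prompt=prompt,
    dataset=dataset,
    metric=metric,
    n_samples=50,
    project_name="Optimization",
    max_trials=10,
    reflection_minibatch_size=3,
    candidate_selection_strategy="pareto",
)
```
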
@@ -171,54 +177,33 @@
             dataset: Opik Dataset to optimize on
             metric: Metric function to evaluate on
             experiment_config: Optional configuration for the experiment
+            max_trials: Maximum number of different prompts to test (default: 10)
             n_samples: Optional number of items to test in the dataset
             auto_continue: Whether to auto-continue optimization
             agent_class: Optional agent class to use
-            …
-            seed (int): Random seed for reproducibility (default: 42)
-            raise_on_exception (bool): Raise exceptions instead of continuing (default: True)
-            mcp_config (MCPExecutionConfig | None): MCP tool calling configuration (default: None)
+            reflection_minibatch_size: Size of reflection minibatches (default: 3)
+            candidate_selection_strategy: Strategy for candidate selection (default: "pareto")
+            skip_perfect_score: Skip candidates with perfect scores (default: True)
+            perfect_score: Score considered perfect (default: 1.0)
+            use_merge: Enable merge operations (default: False)
+            max_merge_invocations: Maximum merge invocations (default: 5)
+            run_dir: Directory for run outputs (default: None)
+            track_best_outputs: Track best outputs during optimization (default: False)
+            display_progress_bar: Display progress bar (default: False)
+            seed: Random seed for reproducibility (default: 42)
+            raise_on_exception: Raise exceptions instead of continuing (default: True)
 
         Returns:
             OptimizationResult: Result of the optimization
         """
         # Use base class validation and setup methods
-        self.…
-
-        # Extract GEPA-specific parameters from kwargs
-        max_metric_calls: int | None = kwargs.get("max_metric_calls", 30)
-        reflection_minibatch_size: int = int(kwargs.get("reflection_minibatch_size", 3))
-        candidate_selection_strategy: str = str(
-            kwargs.get("candidate_selection_strategy", "pareto")
-        )
-        skip_perfect_score: bool = kwargs.get("skip_perfect_score", True)
-        perfect_score: float = float(kwargs.get("perfect_score", 1.0))
-        use_merge: bool = kwargs.get("use_merge", False)
-        max_merge_invocations: int = int(kwargs.get("max_merge_invocations", 5))
-        run_dir: str | None = kwargs.get("run_dir", None)
-        track_best_outputs: bool = kwargs.get("track_best_outputs", False)
-        display_progress_bar: bool = kwargs.get("display_progress_bar", False)
-        seed: int = int(kwargs.get("seed", 42))
-        raise_on_exception: bool = kwargs.get("raise_on_exception", True)
-        kwargs.pop("mcp_config", None)  # Added for MCP support (for future use)
+        self._validate_optimization_inputs(prompt, dataset, metric)
 
         prompt = prompt.copy()
-        if self.project_name:
-            prompt.project_name = self.project_name
         if prompt.model is None:
             prompt.model = self.model
         if not prompt.model_kwargs:
-            prompt.model_kwargs = dict(self.model_kwargs)
+            prompt.model_kwargs = dict(self.model_parameters)
 
         seed_prompt_text = self._extract_system_text(prompt)
         input_key, output_key = self._infer_dataset_keys(dataset)
@@ -227,12 +212,38 @@
         if n_samples and 0 < n_samples < len(items):
             items = items[:n_samples]
 
+        # Calculate max_metric_calls from max_trials and effective samples
+        effective_n_samples = len(items)
+        max_metric_calls = max_trials * effective_n_samples
+        budget_limited_trials = (
+            max_metric_calls // effective_n_samples if effective_n_samples else 0
+        )
+        if reflection_minibatch_size > max_trials:
+            logger.warning(
+                "reflection_minibatch_size (%s) exceeds max_trials (%s); GEPA reflection will not run. "
+                "Increase max_trials or lower the minibatch.",
+                reflection_minibatch_size,
+                max_trials,
+            )
+        elif (
+            budget_limited_trials and reflection_minibatch_size > budget_limited_trials
+        ):
+            logger.warning(
+                "reflection_minibatch_size (%s) exceeds the number of candidates allowed by the metric budget (%s). "
+                "Consider increasing max_trials or n_samples.",
+                reflection_minibatch_size,
+                budget_limited_trials,
+            )
+
         data_insts = self._build_data_insts(items, input_key, output_key)
 
         self._gepa_live_metric_calls = 0
 
         base_prompt = prompt.copy()
 
+        # Set project name from parameter
+        self.project_name = project_name
+
         opt_id: str | None = None
         ds_id: str | None = getattr(dataset, "id", None)
 
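
This hunk replaces the old independent `max_metric_calls` knob with derived arithmetic: one metric call per dataset item per trial, so the budget is `max_trials * effective_n_samples`. A standalone sketch of the same arithmetic (semantics mirrored from the hunk above, not a package API):

```python
def derive_budget(max_trials: int, n_items: int) -> tuple[int, int]:
    """Budget = one metric call per dataset item per trial."""
    max_metric_calls = max_trials * n_items
    # How many whole candidate evaluations that budget can pay for.
    budget_limited_trials = max_metric_calls // n_items if n_items else 0
    return max_metric_calls, budget_limited_trials


# Worked example: max_trials=10 over 50 effective samples -> 500 metric calls,
# enough for 10 budget-limited trials; a reflection_minibatch_size above 10
# would trigger the warnings shown in the hunk.
assert derive_budget(10, 50) == (500, 10)
assert derive_budget(3, 0) == (0, 0)
```
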
@@ -249,8 +260,10 @@
         ) as optimization:
             try:
                 opt_id = optimization.id if optimization is not None else None
+                self.current_optimization_id = opt_id
             except Exception:
                 opt_id = None
+                self.current_optimization_id = None
 
             gepa_reporting.display_header(
                 algorithm=self.__class__.__name__,
@@ -266,11 +279,11 @@
                 optimizer_config={
                     "optimizer": self.__class__.__name__,
                     "model": self.model,
-                    "…
+                    "max_trials": max_trials,
+                    "n_samples": n_samples or "all",
                     "max_metric_calls": max_metric_calls,
                     "reflection_minibatch_size": reflection_minibatch_size,
                     "candidate_selection_strategy": candidate_selection_strategy,
-                    "n_samples": n_samples or "all",
                 },
                 verbose=self.verbose,
             )
@@ -280,15 +293,6 @@
             initial_score = 0.0
             with gepa_reporting.baseline_evaluation(verbose=self.verbose) as baseline:
                 try:
-                    baseline_suppress: ContextManager[Any] = nullcontext()
-                    try:
-                        from ..reporting_utils import (
-                            suppress_opik_logs as _suppress_logs,
-                        )
-
-                        baseline_suppress = _suppress_logs()
-                    except Exception:
-                        pass
                     eval_kwargs = dict(
                         prompt=prompt,
                         dataset=dataset,
@@ -298,7 +302,7 @@
                         extra_metadata={"phase": "baseline"},
                         verbose=0,
                     )
-                    with baseline_suppress:
+                    with suppress_opik_logs():
                         initial_score = float(
                             self._evaluate_prompt_logged(**eval_kwargs)
                         )
@@ -307,12 +311,11 @@
                     logger.exception("Baseline evaluation failed")
 
             adapter_prompt = self._apply_system_text(base_prompt, seed_prompt_text)
-            adapter_prompt.project_name = self.project_name
             adapter_prompt.model = self.model
             # Filter out GEPA-specific parameters that shouldn't be passed to LLM
             filtered_model_kwargs = {
                 k: v
-                for k, v in self.model_kwargs.items()
+                for k, v in self.model_parameters.items()
                 if k not in ["num_prompts_per_round", "rounds"]
             }
             adapter_prompt.model_kwargs = filtered_model_kwargs
@@ -330,40 +333,52 @@
             except Exception as exc:  # pragma: no cover
                 raise ImportError("gepa package is required for GepaOptimizer") from exc
 
-            …
+            # When using our Rich logger, disable GEPA's native progress bar to avoid conflicts
+            use_gepa_progress_bar = display_progress_bar if self.verbose == 0 else False
+
+            with gepa_reporting.start_gepa_optimization(
+                verbose=self.verbose, max_trials=max_trials
+            ) as reporter:
+                # Create logger with progress bar support
+                logger_instance = gepa_reporting.RichGEPAOptimizerLogger(
+                    self,
+                    verbose=self.verbose,
+                    progress=reporter.progress,
+                    task_id=reporter.task_id,
+                    max_trials=max_trials,
+                )
+
+                kwargs_gepa: dict[str, Any] = {
+                    "seed_candidate": {"system_prompt": seed_prompt_text},
+                    "trainset": data_insts,
+                    "valset": data_insts,
+                    "adapter": adapter,
+                    "task_lm": None,
+                    "reflection_lm": self.model,
+                    "candidate_selection_strategy": candidate_selection_strategy,
+                    "skip_perfect_score": skip_perfect_score,
+                    "reflection_minibatch_size": reflection_minibatch_size,
+                    "perfect_score": perfect_score,
+                    "use_merge": use_merge,
+                    "max_merge_invocations": max_merge_invocations,
+                    "max_metric_calls": max_metric_calls,
+                    "run_dir": run_dir,
+                    "track_best_outputs": track_best_outputs,
+                    "display_progress_bar": use_gepa_progress_bar,
+                    "seed": seed,
+                    "raise_on_exception": raise_on_exception,
+                    "logger": logger_instance,
+                }
 
-            optimize_sig = None
-            try:
-                optimize_sig = inspect.signature(gepa.optimize)
-            except Exception:
                 optimize_sig = None
+                try:
+                    optimize_sig = inspect.signature(gepa.optimize)
+                except Exception:
+                    optimize_sig = None
 
-            …
+                if optimize_sig and "stop_callbacks" not in optimize_sig.parameters:
+                    kwargs_gepa["max_metric_calls"] = max_metric_calls
 
-            with gepa_reporting.start_gepa_optimization(verbose=self.verbose):
                 gepa_result = gepa.optimize(**kwargs_gepa)
 
             try:
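
Note the defensive `inspect.signature(gepa.optimize)` probe at the end of the hunk: the optimizer feature-detects whether the installed `gepa` release exposes `stop_callbacks` and, presumably for older releases, falls back to the hard `max_metric_calls` cap. The generic pattern, sketched with a stand-in function rather than the real `gepa.optimize`:

```python
import inspect
from typing import Any


def call_with_supported_kwargs(func: Any, /, **kwargs: Any) -> Any:
    """Pass only the keyword arguments the callee actually declares."""
    try:
        params = inspect.signature(func).parameters
    except (TypeError, ValueError):  # some builtins/C extensions are not introspectable
        return func(**kwargs)
    return func(**{k: v for k, v in kwargs.items() if k in params})


def fake_optimize(seed: int = 0, max_metric_calls: int | None = None) -> dict:
    return {"seed": seed, "max_metric_calls": max_metric_calls}


# unknown_flag is dropped instead of raising TypeError.
print(call_with_supported_kwargs(fake_optimize, seed=42, max_metric_calls=500, unknown_flag=True))
```
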
@@ -381,110 +396,202 @@
             candidates: list[dict[str, str]] = getattr(gepa_result, "candidates", []) or []
             val_scores: list[float] = list(getattr(gepa_result, "val_aggregate_scores", []))
 
+            indexed_candidates: list[tuple[int, dict[str, str]]] = list(
+                enumerate(candidates)
+            )
+            filtered_indexed_candidates = unique_ordered_by_key(
+                indexed_candidates,
+                key=lambda item: self._extract_system_text_from_candidate(
+                    item[1], seed_prompt_text
+                ).strip(),
+            )
+            filtered_candidates: list[dict[str, str]] = [
+                candidate for _, candidate in filtered_indexed_candidates
+            ]
+            filtered_val_scores: list[float | None] = [
+                val_scores[idx] if idx < len(val_scores) else None
+                for idx, _ in filtered_indexed_candidates
+            ]
+
             rescored: list[float] = []
             candidate_rows: list[dict[str, Any]] = []
             history: list[dict[str, Any]] = []
 
-            for …
-            …
+            # Import convert_tqdm_to_rich for suppressing display functions
+            from ..reporting_utils import convert_tqdm_to_rich
+
+            # Wrap rescoring to prevent OPIK messages and experiment link displays
+            with suppress_opik_logs():
+                with convert_tqdm_to_rich(verbose=0):
+                    for idx, (original_idx, candidate) in enumerate(
+                        filtered_indexed_candidates
+                    ):
+                        candidate_prompt = self._extract_system_text_from_candidate(
+                            candidate, seed_prompt_text
+                        )
+                        prompt_variant = self._apply_system_text(prompt, candidate_prompt)
+                        prompt_variant.model = self.model
+                        # Filter out GEPA-specific parameters that shouldn't be passed to LLM
+                        filtered_model_kwargs = {
+                            k: v
+                            for k, v in self.model_parameters.items()
+                            if k not in ["num_prompts_per_round", "rounds"]
+                        }
+                        prompt_variant.model_kwargs = filtered_model_kwargs
 
-            …
-                    "system_prompt": candidate_prompt,
-                    "gepa_score": val_scores[idx] if idx < len(val_scores) else None,
-                    "opik_score": score,
-                    "source": self.__class__.__name__,
-                }
-            )
-            history.append(
-                {
-                    "iteration": idx + 1,
-                    "prompt_candidate": candidate_prompt,
-                    "scores": [
+                        eval_kwargs = dict(
+                            prompt=prompt_variant,
+                            dataset=dataset,
+                            metric=metric,
+                            n_samples=n_samples,
+                            optimization_id=opt_id,
+                            extra_metadata={"phase": "rescoring", "candidate_index": idx},
+                            verbose=0,
+                        )
+                        try:
+                            score = float(self._evaluate_prompt_logged(**eval_kwargs))
+                        except Exception:
+                            logger.debug(
+                                "Rescoring failed for candidate %s", idx, exc_info=True
+                            )
+                            score = 0.0
+
+                        rescored.append(score)
+                        candidate_rows.append(
                             {
-                    "…
-                    "…
-                    …
+                                "iteration": idx + 1,
+                                "system_prompt": candidate_prompt,
+                                "gepa_score": filtered_val_scores[idx],
+                                "opik_score": score,
+                                "source": self.__class__.__name__,
+                            }
+                        )
+                        history.append(
+                            {
+                                "iteration": idx + 1,
+                                "prompt_candidate": candidate_prompt,
+                                "scores": [
+                                    {
+                                        "metric_name": f"GEPA-{metric.__name__}",
+                                        "score": filtered_val_scores[idx],
+                                    },
+                                    {"metric_name": metric.__name__, "score": score},
+                                ],
+                                "metadata": {},
+                            }
+                        )
 
             if rescored:
-                …
+
+                def _tie_break(idx: int) -> tuple[float, float, int]:
+                    opik_score = rescored[idx]
+                    gepa_score = filtered_val_scores[idx]
+                    gepa_numeric = (
+                        float(gepa_score)
+                        if isinstance(gepa_score, (int, float))
+                        else float("-inf")
+                    )
+                    return opik_score, gepa_numeric, idx
+
+                best_idx = max(range(len(rescored)), key=_tie_break)
                 best_score = rescored[best_idx]
             else:
-                …
-                …
+                if filtered_indexed_candidates:
+                    gepa_best_idx = getattr(gepa_result, "best_idx", 0) or 0
+                    best_idx = next(
+                        (
+                            i
+                            for i, (original_idx, _) in enumerate(
+                                filtered_indexed_candidates
+                            )
+                            if original_idx == gepa_best_idx
+                        ),
+                        0,
+                    )
+                    if filtered_val_scores and 0 <= best_idx < len(filtered_val_scores):
+                        score_value = filtered_val_scores[best_idx]
+                        best_score = float(score_value) if score_value is not None else 0.0
+                    else:
+                        best_score = float(initial_score)
+                else:
+                    best_idx = 0
+                    best_score = float(initial_score)
 
             best_candidate = (
-                …
+                filtered_candidates[best_idx]
+                if filtered_candidates
+                else {"system_prompt": seed_prompt_text}
             )
             best_prompt_text = self._extract_system_text_from_candidate(
                 best_candidate, seed_prompt_text
             )
 
             final_prompt = self._apply_system_text(prompt, best_prompt_text)
-            final_prompt.project_name = self.project_name
             final_prompt.model = self.model
             # Filter out GEPA-specific parameters that shouldn't be passed to LLM
             filtered_model_kwargs = {
                 k: v
-                for k, v in self.model_kwargs.items()
+                for k, v in self.model_parameters.items()
                 if k not in ["num_prompts_per_round", "rounds"]
             }
             final_prompt.model_kwargs = filtered_model_kwargs
 
-            …
-                prompt=final_prompt,
-                dataset=dataset,
-                metric=metric,
-                n_samples=n_samples,
-                optimization_id=opt_id,
-                extra_metadata={"phase": "final", "selected": True},
-                verbose=0,
-            )
-            suppress_logs: ContextManager[Any] = nullcontext()
-            try:
-                from ..reporting_utils import suppress_opik_logs as _suppress_logs
-
-                suppress_logs = _suppress_logs()
-            except Exception:
-                pass
+            final_eval_result: Any | None = None
 
-            with suppress_logs:
+            with suppress_opik_logs():
                 try:
-                    …
+                    final_agent_cls = create_litellm_agent_class(
+                        final_prompt, optimizer_ref=self
+                    )
+                    final_agent = final_agent_cls(final_prompt)
+
+                    def final_llm_task(dataset_item: dict[str, Any]) -> dict[str, str]:
+                        messages = final_prompt.get_messages(dataset_item)
+                        raw = final_agent.invoke(messages)
+                        if self.current_optimization_id:
+                            opik_context.update_current_trace(
+                                tags=[self.current_optimization_id, "Evaluation"]
+                            )
+                        return {mappers.EVALUATED_LLM_TASK_OUTPUT: raw.strip()}
+
+                    configuration_updates = self._drop_none(
+                        {"gepa": {"phase": "final", "selected": True}}
+                    )
+                    final_experiment_config = self._prepare_experiment_config(
+                        prompt=final_prompt,
+                        dataset=dataset,
+                        metric=metric,
+                        experiment_config=experiment_config,
+                        configuration_updates=configuration_updates,
+                    )
+
+                    metric_class = _create_metric_class(metric)
+
+                    if opt_id:
+                        final_eval_result = opik_evaluator.evaluate_optimization_trial(
+                            optimization_id=opt_id,
+                            dataset=dataset,
+                            task=final_llm_task,
+                            project_name=final_experiment_config.get("project_name"),
+                            dataset_item_ids=None,
+                            scoring_metrics=[metric_class],
+                            task_threads=self.n_threads,
+                            nb_samples=n_samples,
+                            experiment_config=final_experiment_config,
+                            verbose=0,
+                        )
+                    else:
+                        final_eval_result = opik_evaluator.evaluate(
+                            dataset=dataset,
+                            task=final_llm_task,
+                            project_name=final_experiment_config.get("project_name"),
+                            dataset_item_ids=None,
+                            scoring_metrics=[metric_class],
+                            task_threads=self.n_threads,
+                            nb_samples=n_samples,
+                            experiment_config=final_experiment_config,
+                            verbose=0,
+                        )
                 except Exception:
                     logger.debug("Final evaluation failed", exc_info=True)
 
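
Two small algorithms drive this hunk: an order-preserving dedup of GEPA candidates keyed on their extracted system text (`unique_ordered_by_key`, newly added in `opik_optimizer/utils/candidate_utils.py`), and best-candidate selection that ranks by the Opik re-score, then the GEPA validation score (treating `None` as negative infinity), then the index. A self-contained sketch, assuming first-occurrence-wins semantics for the helper:

```python
from typing import Any, Callable, Iterable, TypeVar

T = TypeVar("T")


def unique_ordered_by_key(items: Iterable[T], key: Callable[[T], Any]) -> list[T]:
    """Keep the first item seen for each key, preserving input order (assumed semantics)."""
    seen: set[Any] = set()
    out: list[T] = []
    for item in items:
        k = key(item)
        if k not in seen:
            seen.add(k)
            out.append(item)
    return out


# Tie-break used for best-candidate selection: higher Opik re-score wins;
# equal re-scores fall back to the GEPA validation score (None -> -inf);
# a further tie prefers the later candidate.
rescored = [0.8, 0.9, 0.9]
gepa_scores: list[float | None] = [0.7, None, 0.75]


def tie_break(i: int) -> tuple[float, float, int]:
    g = gepa_scores[i]
    return rescored[i], float(g) if isinstance(g, (int, float)) else float("-inf"), i


assert max(range(len(rescored)), key=tie_break) == 2
```
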
@@ -514,28 +621,55 @@
             except Exception:
                 logger.debug("Per-item diagnostics failed", exc_info=True)
 
+            trial_info: dict[str, Any] | None = None
+            if final_eval_result is not None:
+                experiment_name = getattr(final_eval_result, "experiment_name", None)
+                experiment_url = getattr(final_eval_result, "experiment_url", None)
+                trial_ids = []
+                try:
+                    trial_ids = sorted(
+                        {
+                            str(test_result.trial_id)
+                            for test_result in getattr(
+                                final_eval_result, "test_results", []
+                            )
+                            if getattr(test_result, "trial_id", None) is not None
+                        }
+                    )
+                except Exception:
+                    logger.debug("Failed to extract trial IDs", exc_info=True)
+
+                trial_info = {
+                    "experiment_name": experiment_name,
+                    "experiment_url": experiment_url,
+                    "trial_ids": trial_ids,
+                }
+
             details: dict[str, Any] = {
                 "model": self.model,
-                "temperature": self.model_kwargs.get("temperature"),
+                "temperature": self.model_parameters.get("temperature"),
                 "optimizer": self.__class__.__name__,
-                "num_candidates": len(candidates),
+                "num_candidates": len(filtered_candidates),
                 "total_metric_calls": getattr(gepa_result, "total_metric_calls", None),
                 "parents": getattr(gepa_result, "parents", None),
-                "val_scores": val_scores,
+                "val_scores": filtered_val_scores,
                 "opik_rescored_scores": rescored,
                 "candidate_summary": candidate_rows,
                 "best_candidate_iteration": (
                     candidate_rows[best_idx]["iteration"] if candidate_rows else 0
                 ),
-                "selected_candidate_index": best_idx,
+                "selected_candidate_index": best_idx if filtered_candidates else None,
                 "selected_candidate_gepa_score": (
-                    …
+                    filtered_val_scores[best_idx]
+                    if filtered_val_scores and 0 <= best_idx < len(filtered_val_scores)
+                    else None
                 ),
                 "selected_candidate_opik_score": best_score,
                 "gepa_live_metric_used": True,
                 "gepa_live_metric_call_count": self._gepa_live_metric_calls,
                 "selected_candidate_item_scores": per_item_scores,
                 "dataset_item_ids": [item.get("id") for item in items],
+                "selected_candidate_trial_info": trial_info,
             }
             if experiment_config:
                 details["experiment"] = experiment_config
@@ -547,7 +681,10 @@
                 candidate_rows, verbose=self.verbose
             )
             gepa_reporting.display_selected_candidate(
-                best_prompt_text, …
+                best_prompt_text,
+                best_score,
+                verbose=self.verbose,
+                trial_info=trial_info,
             )
 
             if logger.isEnabledFor(logging.DEBUG):
@@ -618,7 +755,7 @@
         if prompt.model is None:
             prompt.model = self.model
         if prompt.model_kwargs is None:
-            prompt.model_kwargs = self.model_kwargs
+            prompt.model_kwargs = self.model_parameters
 
         agent_class = create_litellm_agent_class(prompt, optimizer_ref=self)
         self.agent_class = agent_class
@@ -627,6 +764,13 @@
         def llm_task(dataset_item: dict[str, Any]) -> dict[str, str]:
            messages = prompt.get_messages(dataset_item)
            raw = agent.invoke(messages)
+
+           # Add tags to trace for optimization tracking
+           if self.current_optimization_id:
+               opik_context.update_current_trace(
+                   tags=[self.current_optimization_id, "Evaluation"]
+               )
+
            return {mappers.EVALUATED_LLM_TASK_OUTPUT: raw.strip()}
 
         configuration_updates = self._drop_none({"gepa": extra_metadata})
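
Both the rescoring path and this shared `llm_task` now tag every evaluation trace with the current optimization id, which is presumably what lets the Opik UI group evaluation traces under a run. The pattern in isolation, assuming a configured Opik client and the public `opik.track`/`opik_context` API:

```python
from typing import Any

from opik import opik_context, track


@track  # ensures a current trace exists for opik_context to update
def llm_task(dataset_item: dict[str, Any], optimization_id: str | None = None) -> dict[str, str]:
    output = f"echo: {dataset_item.get('input', '')}"  # stand-in for agent.invoke(messages)
    if optimization_id:
        opik_context.update_current_trace(tags=[optimization_id, "Evaluation"])
    return {"output": output.strip()}
```
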
@@ -643,7 +787,7 @@
             dataset_item_ids=dataset_item_ids,
             metric=metric,
             evaluated_task=llm_task,
-            num_threads=self.…
+            num_threads=self.n_threads,
             project_name=experiment_config.get("project_name"),
             experiment_config=experiment_config,
             optimization_id=optimization_id,