opik_optimizer-2.1.3-py3-none-any.whl → opik_optimizer-2.2.0-py3-none-any.whl
This diff represents the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that registry.
- opik_optimizer/__init__.py +0 -2
- opik_optimizer/base_optimizer.py +314 -145
- opik_optimizer/evolutionary_optimizer/crossover_ops.py +31 -4
- opik_optimizer/evolutionary_optimizer/evaluation_ops.py +23 -3
- opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +122 -95
- opik_optimizer/evolutionary_optimizer/mcp.py +11 -6
- opik_optimizer/evolutionary_optimizer/mutation_ops.py +25 -5
- opik_optimizer/evolutionary_optimizer/population_ops.py +26 -10
- opik_optimizer/evolutionary_optimizer/reporting.py +5 -5
- opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +53 -99
- opik_optimizer/few_shot_bayesian_optimizer/reporting.py +4 -4
- opik_optimizer/gepa_optimizer/gepa_optimizer.py +183 -172
- opik_optimizer/gepa_optimizer/reporting.py +164 -22
- opik_optimizer/hierarchical_reflective_optimizer/hierarchical_reflective_optimizer.py +90 -167
- opik_optimizer/hierarchical_reflective_optimizer/prompts.py +7 -1
- opik_optimizer/hierarchical_reflective_optimizer/reporting.py +168 -75
- opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +185 -205
- opik_optimizer/meta_prompt_optimizer/reporting.py +4 -4
- opik_optimizer/mipro_optimizer/__init__.py +2 -2
- opik_optimizer/mipro_optimizer/_lm.py +4 -4
- opik_optimizer/mipro_optimizer/{_mipro_optimizer_v2.py → mipro_optimizer_v2.py} +1 -7
- opik_optimizer/mipro_optimizer/utils.py +1 -0
- opik_optimizer/optimizable_agent.py +7 -4
- opik_optimizer/optimization_config/chat_prompt.py +7 -10
- opik_optimizer/parameter_optimizer/parameter_optimizer.py +188 -40
- opik_optimizer/parameter_optimizer/reporting.py +148 -0
- opik_optimizer/reporting_utils.py +42 -15
- opik_optimizer/utils/core.py +16 -2
- opik_optimizer/utils/prompt_segments.py +1 -2
- {opik_optimizer-2.1.3.dist-info → opik_optimizer-2.2.0.dist-info}/METADATA +2 -3
- {opik_optimizer-2.1.3.dist-info → opik_optimizer-2.2.0.dist-info}/RECORD +34 -35
- opik_optimizer/evolutionary_optimizer/llm_support.py +0 -136
- opik_optimizer/mipro_optimizer/mipro_optimizer.py +0 -680
- {opik_optimizer-2.1.3.dist-info → opik_optimizer-2.2.0.dist-info}/WHEEL +0 -0
- {opik_optimizer-2.1.3.dist-info → opik_optimizer-2.2.0.dist-info}/licenses/LICENSE +0 -0
- {opik_optimizer-2.1.3.dist-info → opik_optimizer-2.2.0.dist-info}/top_level.txt +0 -0
opik_optimizer/gepa_optimizer/gepa_optimizer.py

```diff
@@ -4,7 +4,7 @@ from typing import Any, ContextManager
 from collections.abc import Callable
 
 import opik
-from opik import Dataset
+from opik import Dataset, opik_context
 from opik.evaluation.metrics.score_result import ScoreResult
 
 from ..base_optimizer import BaseOptimizer
@@ -17,6 +17,7 @@ from ..utils import (
     disable_experiment_reporting,
     enable_experiment_reporting,
 )
+from ..reporting_utils import suppress_opik_logs
 from .. import task_evaluator
 from . import reporting as gepa_reporting
 from .adapter import OpikDataInst, OpikGEPAAdapter
```
```diff
@@ -25,16 +26,30 @@ logger = logging.getLogger(__name__)
 
 
 class GepaOptimizer(BaseOptimizer):
-    """
+    """
+    The GEPA (Genetic-Pareto) Optimizer uses a genetic algorithm with Pareto optimization
+    to improve prompts while balancing multiple objectives.
+
+    This algorithm is well-suited for complex optimization tasks where you want to find
+    prompts that balance trade-offs between different quality metrics.
+
+    Args:
+        model: LiteLLM model name for the optimization algorithm
+        model_parameters: Optional dict of LiteLLM parameters for optimizer's internal LLM calls.
+            Common params: temperature, max_tokens, max_completion_tokens, top_p.
+            See: https://docs.litellm.ai/docs/completion/input
+        n_threads: Number of parallel threads for evaluation
+        verbose: Controls internal logging/progress bars (0=off, 1=on)
+        seed: Random seed for reproducibility
+    """
 
     def __init__(
         self,
-        model: str,
-
-
+        model: str = "gpt-4o",
+        model_parameters: dict[str, Any] | None = None,
+        n_threads: int = 6,
         verbose: int = 1,
         seed: int = 42,
-        **model_kwargs: Any,
     ) -> None:
         # Validate required parameters
         if model is None:
```
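The constructor contract changes here: `**model_kwargs` is gone, and LiteLLM parameters travel in an explicit `model_parameters` dict alongside the new `n_threads` argument. A minimal sketch of constructing the optimizer against the 2.2.0 signature — the top-level import path and the parameter values are assumptions for illustration, not part of this diff:

```python
# Sketch only: import path assumed from the package layout above; values are examples.
from opik_optimizer import GepaOptimizer

optimizer = GepaOptimizer(
    model="gpt-4o",                          # LiteLLM model name (new default)
    model_parameters={"temperature": 0.2},   # forwarded to the optimizer's internal LLM calls
    n_threads=6,                             # parallel evaluation threads
    verbose=1,
    seed=42,
)
```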
```diff
@@ -45,16 +60,6 @@ class GepaOptimizer(BaseOptimizer):
             raise ValueError("model cannot be empty or whitespace-only")
 
         # Validate optional parameters
-        if project_name is not None and not isinstance(project_name, str):
-            raise ValueError(
-                f"project_name must be a string or None, got {type(project_name).__name__}"
-            )
-
-        if reflection_model is not None and not isinstance(reflection_model, str):
-            raise ValueError(
-                f"reflection_model must be a string or None, got {type(reflection_model).__name__}"
-            )
-
         if not isinstance(verbose, int):
             raise ValueError(
                 f"verbose must be an integer, got {type(verbose).__name__}"
@@ -65,32 +70,19 @@ class GepaOptimizer(BaseOptimizer):
         if not isinstance(seed, int):
             raise ValueError(f"seed must be an integer, got {type(seed).__name__}")
 
-        super().__init__(
-
-
-        self.
+        super().__init__(
+            model=model, verbose=verbose, seed=seed, model_parameters=model_parameters
+        )
+        self.n_threads = n_threads
         self._gepa_live_metric_calls = 0
         self._adapter = None  # Will be set during optimization
 
     def get_optimizer_metadata(self) -> dict[str, Any]:
         return {
-            "
-            "
+            "model": self.model,
+            "n_threads": self.n_threads,
         }
 
-    def cleanup(self) -> None:
-        """
-        Clean up GEPA-specific resources.
-        """
-        # Call parent cleanup
-        super().cleanup()
-
-        # Clear GEPA-specific resources
-        self._adapter = None
-        self._gepa_live_metric_calls = 0
-
-        logger.debug("Cleaned up GEPA-specific resources")
-
     # ------------------------------------------------------------------
     # Helpers
     # ------------------------------------------------------------------
@@ -161,7 +153,19 @@ class GepaOptimizer(BaseOptimizer):
         n_samples: int | None = None,
         auto_continue: bool = False,
         agent_class: type[OptimizableAgent] | None = None,
-
+        project_name: str = "Optimization",
+        max_trials: int = 10,
+        reflection_minibatch_size: int = 3,
+        candidate_selection_strategy: str = "pareto",
+        skip_perfect_score: bool = True,
+        perfect_score: float = 1.0,
+        use_merge: bool = False,
+        max_merge_invocations: int = 5,
+        run_dir: str | None = None,
+        track_best_outputs: bool = False,
+        display_progress_bar: bool = False,
+        seed: int = 42,
+        raise_on_exception: bool = True,
     ) -> OptimizationResult:
         """
         Optimize a prompt using GEPA (Genetic-Pareto) algorithm.
@@ -171,54 +175,33 @@ class GepaOptimizer(BaseOptimizer):
             dataset: Opik Dataset to optimize on
             metric: Metric function to evaluate on
             experiment_config: Optional configuration for the experiment
+            max_trials: Maximum number of different prompts to test (default: 10)
             n_samples: Optional number of items to test in the dataset
             auto_continue: Whether to auto-continue optimization
             agent_class: Optional agent class to use
-
-
-
-
-
-
-
-
-
-
-
-            seed (int): Random seed for reproducibility (default: 42)
-            raise_on_exception (bool): Raise exceptions instead of continuing (default: True)
-            mcp_config (MCPExecutionConfig | None): MCP tool calling configuration (default: None)
+            reflection_minibatch_size: Size of reflection minibatches (default: 3)
+            candidate_selection_strategy: Strategy for candidate selection (default: "pareto")
+            skip_perfect_score: Skip candidates with perfect scores (default: True)
+            perfect_score: Score considered perfect (default: 1.0)
+            use_merge: Enable merge operations (default: False)
+            max_merge_invocations: Maximum merge invocations (default: 5)
+            run_dir: Directory for run outputs (default: None)
+            track_best_outputs: Track best outputs during optimization (default: False)
+            display_progress_bar: Display progress bar (default: False)
+            seed: Random seed for reproducibility (default: 42)
+            raise_on_exception: Raise exceptions instead of continuing (default: True)
 
         Returns:
             OptimizationResult: Result of the optimization
         """
         # Use base class validation and setup methods
-        self.
-
-        # Extract GEPA-specific parameters from kwargs
-        max_metric_calls: int | None = kwargs.get("max_metric_calls", 30)
-        reflection_minibatch_size: int = int(kwargs.get("reflection_minibatch_size", 3))
-        candidate_selection_strategy: str = str(
-            kwargs.get("candidate_selection_strategy", "pareto")
-        )
-        skip_perfect_score: bool = kwargs.get("skip_perfect_score", True)
-        perfect_score: float = float(kwargs.get("perfect_score", 1.0))
-        use_merge: bool = kwargs.get("use_merge", False)
-        max_merge_invocations: int = int(kwargs.get("max_merge_invocations", 5))
-        run_dir: str | None = kwargs.get("run_dir", None)
-        track_best_outputs: bool = kwargs.get("track_best_outputs", False)
-        display_progress_bar: bool = kwargs.get("display_progress_bar", False)
-        seed: int = int(kwargs.get("seed", 42))
-        raise_on_exception: bool = kwargs.get("raise_on_exception", True)
-        kwargs.pop("mcp_config", None)  # Added for MCP support (for future use)
+        self._validate_optimization_inputs(prompt, dataset, metric)
 
         prompt = prompt.copy()
-        if self.project_name:
-            prompt.project_name = self.project_name
         if prompt.model is None:
             prompt.model = self.model
         if not prompt.model_kwargs:
-            prompt.model_kwargs = dict(self.
+            prompt.model_kwargs = dict(self.model_parameters)
 
         seed_prompt_text = self._extract_system_text(prompt)
         input_key, output_key = self._infer_dataset_keys(dataset)
```
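With the `kwargs.get(...)` extraction removed, every GEPA tuning knob is now a typed keyword argument on `optimize_prompt`, and `project_name` moves here from the constructor. A hedged sketch of a call under the new signature; `prompt`, `dataset`, and `metric` are placeholders assumed to exist and are not defined in this diff:

```python
# Sketch only: prompt/dataset/metric are assumed to be defined elsewhere.
result = optimizer.optimize_prompt(
    prompt=prompt,                            # ChatPrompt to improve
    dataset=dataset,                          # opik.Dataset with evaluation items
    metric=metric,                            # callable returning a ScoreResult
    project_name="Optimization",              # moved here from __init__
    max_trials=10,                            # replaces the old max_metric_calls kwarg
    n_samples=50,
    candidate_selection_strategy="pareto",
)
print(result)                                 # OptimizationResult
```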
```diff
@@ -227,12 +210,19 @@ class GepaOptimizer(BaseOptimizer):
         if n_samples and 0 < n_samples < len(items):
             items = items[:n_samples]
 
+        # Calculate max_metric_calls from max_trials and effective samples
+        effective_n_samples = len(items)
+        max_metric_calls = max_trials * effective_n_samples
+
         data_insts = self._build_data_insts(items, input_key, output_key)
 
         self._gepa_live_metric_calls = 0
 
         base_prompt = prompt.copy()
 
+        # Set project name from parameter
+        self.project_name = project_name
+
         opt_id: str | None = None
         ds_id: str | None = getattr(dataset, "id", None)
 
```
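The evaluation budget is now derived instead of passed in: each trial scores every effective dataset item once, so `max_metric_calls = max_trials * len(items)`. A quick check of the arithmetic with the defaults (the 50-item dataset size is an example, not a default):

```python
# With max_trials=10, a 50-item dataset yields a budget of 500 metric calls.
max_trials = 10            # default from the new optimize_prompt signature
effective_n_samples = 50   # len(items) after any n_samples slicing
max_metric_calls = max_trials * effective_n_samples
assert max_metric_calls == 500
```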
```diff
@@ -249,8 +239,10 @@ class GepaOptimizer(BaseOptimizer):
         ) as optimization:
             try:
                 opt_id = optimization.id if optimization is not None else None
+                self.current_optimization_id = opt_id
             except Exception:
                 opt_id = None
+                self.current_optimization_id = None
 
             gepa_reporting.display_header(
                 algorithm=self.__class__.__name__,
@@ -266,11 +258,11 @@ class GepaOptimizer(BaseOptimizer):
                 optimizer_config={
                     "optimizer": self.__class__.__name__,
                     "model": self.model,
-                    "
+                    "max_trials": max_trials,
+                    "n_samples": n_samples or "all",
                     "max_metric_calls": max_metric_calls,
                     "reflection_minibatch_size": reflection_minibatch_size,
                     "candidate_selection_strategy": candidate_selection_strategy,
-                    "n_samples": n_samples or "all",
                 },
                 verbose=self.verbose,
             )
@@ -280,15 +272,6 @@ class GepaOptimizer(BaseOptimizer):
             initial_score = 0.0
             with gepa_reporting.baseline_evaluation(verbose=self.verbose) as baseline:
                 try:
-                    baseline_suppress: ContextManager[Any] = nullcontext()
-                    try:
-                        from ..reporting_utils import (
-                            suppress_opik_logs as _suppress_logs,
-                        )
-
-                        baseline_suppress = _suppress_logs()
-                    except Exception:
-                        pass
                     eval_kwargs = dict(
                         prompt=prompt,
                         dataset=dataset,
@@ -298,7 +281,7 @@ class GepaOptimizer(BaseOptimizer):
                         extra_metadata={"phase": "baseline"},
                         verbose=0,
                     )
-                    with
+                    with suppress_opik_logs():
                         initial_score = float(
                             self._evaluate_prompt_logged(**eval_kwargs)
                         )
```
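The defensive try/import dance around baseline evaluation collapses into the module-level `suppress_opik_logs` import used as an ordinary context manager. A minimal, self-contained sketch of the pattern; the `evaluate` function here is a hypothetical stand-in, not the optimizer's API:

```python
from opik_optimizer.reporting_utils import suppress_opik_logs

def evaluate() -> float:
    return 1.0  # hypothetical stand-in for a real evaluation call

# Opik client log output emitted inside the block is silenced, which keeps
# the optimizer's Rich console output clean while candidates are scored.
with suppress_opik_logs():
    score = evaluate()
```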
```diff
@@ -307,12 +290,11 @@ class GepaOptimizer(BaseOptimizer):
                     logger.exception("Baseline evaluation failed")
 
             adapter_prompt = self._apply_system_text(base_prompt, seed_prompt_text)
-            adapter_prompt.project_name = self.project_name
             adapter_prompt.model = self.model
             # Filter out GEPA-specific parameters that shouldn't be passed to LLM
             filtered_model_kwargs = {
                 k: v
-                for k, v in self.
+                for k, v in self.model_parameters.items()
                 if k not in ["num_prompts_per_round", "rounds"]
             }
             adapter_prompt.model_kwargs = filtered_model_kwargs
@@ -330,40 +312,52 @@ class GepaOptimizer(BaseOptimizer):
             except Exception as exc:  # pragma: no cover
                 raise ImportError("gepa package is required for GepaOptimizer") from exc
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            # When using our Rich logger, disable GEPA's native progress bar to avoid conflicts
+            use_gepa_progress_bar = display_progress_bar if self.verbose == 0 else False
+
+            with gepa_reporting.start_gepa_optimization(
+                verbose=self.verbose, max_trials=max_trials
+            ) as reporter:
+                # Create logger with progress bar support
+                logger_instance = gepa_reporting.RichGEPAOptimizerLogger(
+                    self,
+                    verbose=self.verbose,
+                    progress=reporter.progress,
+                    task_id=reporter.task_id,
+                    max_trials=max_trials,
+                )
+
+                kwargs_gepa: dict[str, Any] = {
+                    "seed_candidate": {"system_prompt": seed_prompt_text},
+                    "trainset": data_insts,
+                    "valset": data_insts,
+                    "adapter": adapter,
+                    "task_lm": None,
+                    "reflection_lm": self.model,
+                    "candidate_selection_strategy": candidate_selection_strategy,
+                    "skip_perfect_score": skip_perfect_score,
+                    "reflection_minibatch_size": reflection_minibatch_size,
+                    "perfect_score": perfect_score,
+                    "use_merge": use_merge,
+                    "max_merge_invocations": max_merge_invocations,
+                    "max_metric_calls": max_metric_calls,
+                    "run_dir": run_dir,
+                    "track_best_outputs": track_best_outputs,
+                    "display_progress_bar": use_gepa_progress_bar,
+                    "seed": seed,
+                    "raise_on_exception": raise_on_exception,
+                    "logger": logger_instance,
+                }
 
-            optimize_sig = None
-            try:
-                optimize_sig = inspect.signature(gepa.optimize)
-            except Exception:
                 optimize_sig = None
+                try:
+                    optimize_sig = inspect.signature(gepa.optimize)
+                except Exception:
+                    optimize_sig = None
 
-
-
+                if optimize_sig and "stop_callbacks" not in optimize_sig.parameters:
+                    kwargs_gepa["max_metric_calls"] = max_metric_calls
 
-            with gepa_reporting.start_gepa_optimization(verbose=self.verbose):
                 gepa_result = gepa.optimize(**kwargs_gepa)
 
             try:
```
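Note the probe before `gepa.optimize`: the code inspects the installed function's signature and only relies on `max_metric_calls` when `stop_callbacks` is unavailable, so both older and newer `gepa` releases keep working. A generic, self-contained sketch of that capability-detection pattern — the helper name is invented for illustration:

```python
import inspect
from typing import Any, Callable

def call_with_supported_kwargs(func: Callable[..., Any], **kwargs: Any) -> Any:
    """Pass only the kwargs the installed callee actually accepts."""
    try:
        params = inspect.signature(func).parameters
    except (TypeError, ValueError):
        # Some callables (builtins or C extensions) are not introspectable.
        return func(**kwargs)
    return func(**{k: v for k, v in kwargs.items() if k in params})
```

A fuller version would also detect a `**kwargs` parameter (`inspect.Parameter.VAR_KEYWORD`) in the callee and skip filtering in that case.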
```diff
@@ -385,60 +379,71 @@ class GepaOptimizer(BaseOptimizer):
             candidate_rows: list[dict[str, Any]] = []
             history: list[dict[str, Any]] = []
 
-            for
-
-                    candidate, seed_prompt_text
-                )
-                prompt_variant = self._apply_system_text(prompt, candidate_prompt)
-                prompt_variant.project_name = self.project_name
-                prompt_variant.model = self.model
-                # Filter out GEPA-specific parameters that shouldn't be passed to LLM
-                filtered_model_kwargs = {
-                    k: v
-                    for k, v in self.model_kwargs.items()
-                    if k not in ["num_prompts_per_round", "rounds"]
-                }
-                prompt_variant.model_kwargs = filtered_model_kwargs
+            # Import convert_tqdm_to_rich for suppressing display functions
+            from ..reporting_utils import convert_tqdm_to_rich
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            # Wrap rescoring to prevent OPIK messages and experiment link displays
+            with suppress_opik_logs():
+                with convert_tqdm_to_rich(verbose=0):
+                    for idx, candidate in enumerate(candidates):
+                        candidate_prompt = self._extract_system_text_from_candidate(
+                            candidate, seed_prompt_text
+                        )
+                        prompt_variant = self._apply_system_text(prompt, candidate_prompt)
+                        prompt_variant.model = self.model
+                        # Filter out GEPA-specific parameters that shouldn't be passed to LLM
+                        filtered_model_kwargs = {
+                            k: v
+                            for k, v in self.model_parameters.items()
+                            if k not in ["num_prompts_per_round", "rounds"]
+                        }
+                        prompt_variant.model_kwargs = filtered_model_kwargs
+
+                        eval_kwargs = dict(
+                            prompt=prompt_variant,
+                            dataset=dataset,
+                            metric=metric,
+                            n_samples=n_samples,
+                            optimization_id=opt_id,
+                            extra_metadata={"phase": "rescoring", "candidate_index": idx},
+                            verbose=0,
+                        )
+                        try:
+                            score = float(self._evaluate_prompt_logged(**eval_kwargs))
+                        except Exception:
+                            logger.debug(
+                                "Rescoring failed for candidate %s", idx, exc_info=True
+                            )
+                            score = 0.0
+
+                        rescored.append(score)
+                        candidate_rows.append(
                             {
-            "
-            "
-
-
-
-
-
-
+                                "iteration": idx + 1,
+                                "system_prompt": candidate_prompt,
+                                "gepa_score": val_scores[idx]
+                                if idx < len(val_scores)
+                                else None,
+                                "opik_score": score,
+                                "source": self.__class__.__name__,
+                            }
+                        )
+                        history.append(
+                            {
+                                "iteration": idx + 1,
+                                "prompt_candidate": candidate_prompt,
+                                "scores": [
+                                    {
+                                        "metric_name": f"GEPA-{metric.__name__}",
+                                        "score": val_scores[idx]
+                                        if idx < len(val_scores)
+                                        else None,
+                                    },
+                                    {"metric_name": metric.__name__, "score": score},
+                                ],
+                                "metadata": {},
+                            }
+                        )
 
             if rescored:
                 best_idx = max(range(len(rescored)), key=lambda i: rescored[i])
@@ -455,12 +460,11 @@ class GepaOptimizer(BaseOptimizer):
             )
 
             final_prompt = self._apply_system_text(prompt, best_prompt_text)
-            final_prompt.project_name = self.project_name
             final_prompt.model = self.model
             # Filter out GEPA-specific parameters that shouldn't be passed to LLM
             filtered_model_kwargs = {
                 k: v
-                for k, v in self.
+                for k, v in self.model_parameters.items()
                 if k not in ["num_prompts_per_round", "rounds"]
             }
             final_prompt.model_kwargs = filtered_model_kwargs
@@ -516,7 +520,7 @@ class GepaOptimizer(BaseOptimizer):
 
         details: dict[str, Any] = {
             "model": self.model,
-            "temperature": self.
+            "temperature": self.model_parameters.get("temperature"),
             "optimizer": self.__class__.__name__,
             "num_candidates": getattr(gepa_result, "num_candidates", None),
             "total_metric_calls": getattr(gepa_result, "total_metric_calls", None),
@@ -618,7 +622,7 @@ class GepaOptimizer(BaseOptimizer):
         if prompt.model is None:
             prompt.model = self.model
         if prompt.model_kwargs is None:
-            prompt.model_kwargs = self.
+            prompt.model_kwargs = self.model_parameters
 
         agent_class = create_litellm_agent_class(prompt, optimizer_ref=self)
         self.agent_class = agent_class
@@ -627,6 +631,13 @@ class GepaOptimizer(BaseOptimizer):
         def llm_task(dataset_item: dict[str, Any]) -> dict[str, str]:
             messages = prompt.get_messages(dataset_item)
             raw = agent.invoke(messages)
+
+            # Add tags to trace for optimization tracking
+            if self.current_optimization_id:
+                opik_context.update_current_trace(
+                    tags=[self.current_optimization_id, "Evaluation"]
+                )
+
             return {mappers.EVALUATED_LLM_TASK_OUTPUT: raw.strip()}
 
         configuration_updates = self._drop_none({"gepa": extra_metadata})
```
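Evaluation traces are now tagged with the active optimization id via `opik_context.update_current_trace`, which lets the Opik UI group them under the run. A minimal sketch of the same call inside any Opik-tracked function; `"opt-123"` stands in for a real optimization id, and the function body is illustrative:

```python
import opik
from opik import opik_context

@opik.track
def llm_task(item: dict) -> str:
    # Tag the current trace so it is filterable under the optimization run.
    opik_context.update_current_trace(tags=["opt-123", "Evaluation"])
    return "response"
```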
```diff
@@ -643,7 +654,7 @@ class GepaOptimizer(BaseOptimizer):
             dataset_item_ids=dataset_item_ids,
             metric=metric,
             evaluated_task=llm_task,
-            num_threads=self.
+            num_threads=self.n_threads,
             project_name=experiment_config.get("project_name"),
             experiment_config=experiment_config,
             optimization_id=optimization_id,
```