opik-optimizer 1.0.6__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff compares the contents of publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the registry.
- opik_optimizer/__init__.py +4 -0
- opik_optimizer/_throttle.py +2 -1
- opik_optimizer/base_optimizer.py +402 -28
- opik_optimizer/data/context7_eval.jsonl +3 -0
- opik_optimizer/datasets/context7_eval.py +90 -0
- opik_optimizer/datasets/tiny_test.py +33 -34
- opik_optimizer/datasets/truthful_qa.py +2 -2
- opik_optimizer/evolutionary_optimizer/crossover_ops.py +194 -0
- opik_optimizer/evolutionary_optimizer/evaluation_ops.py +136 -0
- opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +289 -966
- opik_optimizer/evolutionary_optimizer/helpers.py +10 -0
- opik_optimizer/evolutionary_optimizer/llm_support.py +136 -0
- opik_optimizer/evolutionary_optimizer/mcp.py +249 -0
- opik_optimizer/evolutionary_optimizer/mutation_ops.py +306 -0
- opik_optimizer/evolutionary_optimizer/population_ops.py +228 -0
- opik_optimizer/evolutionary_optimizer/prompts.py +352 -0
- opik_optimizer/evolutionary_optimizer/reporting.py +28 -4
- opik_optimizer/evolutionary_optimizer/style_ops.py +86 -0
- opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +90 -81
- opik_optimizer/few_shot_bayesian_optimizer/reporting.py +12 -5
- opik_optimizer/gepa_optimizer/__init__.py +3 -0
- opik_optimizer/gepa_optimizer/adapter.py +154 -0
- opik_optimizer/gepa_optimizer/gepa_optimizer.py +653 -0
- opik_optimizer/gepa_optimizer/reporting.py +181 -0
- opik_optimizer/logging_config.py +42 -7
- opik_optimizer/mcp_utils/__init__.py +22 -0
- opik_optimizer/mcp_utils/mcp.py +541 -0
- opik_optimizer/mcp_utils/mcp_second_pass.py +152 -0
- opik_optimizer/mcp_utils/mcp_simulator.py +116 -0
- opik_optimizer/mcp_utils/mcp_workflow.py +547 -0
- opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +470 -134
- opik_optimizer/meta_prompt_optimizer/reporting.py +16 -2
- opik_optimizer/mipro_optimizer/_lm.py +30 -23
- opik_optimizer/mipro_optimizer/_mipro_optimizer_v2.py +52 -51
- opik_optimizer/mipro_optimizer/mipro_optimizer.py +126 -46
- opik_optimizer/mipro_optimizer/utils.py +2 -4
- opik_optimizer/optimizable_agent.py +21 -16
- opik_optimizer/optimization_config/chat_prompt.py +44 -23
- opik_optimizer/optimization_config/configs.py +3 -3
- opik_optimizer/optimization_config/mappers.py +9 -8
- opik_optimizer/optimization_result.py +22 -14
- opik_optimizer/reporting_utils.py +61 -10
- opik_optimizer/task_evaluator.py +9 -8
- opik_optimizer/utils/__init__.py +15 -0
- opik_optimizer/utils/colbert.py +236 -0
- opik_optimizer/{utils.py → utils/core.py} +160 -33
- opik_optimizer/utils/dataset_utils.py +49 -0
- opik_optimizer/utils/prompt_segments.py +186 -0
- opik_optimizer-2.0.0.dist-info/METADATA +345 -0
- opik_optimizer-2.0.0.dist-info/RECORD +74 -0
- opik_optimizer-2.0.0.dist-info/licenses/LICENSE +203 -0
- opik_optimizer-1.0.6.dist-info/METADATA +0 -181
- opik_optimizer-1.0.6.dist-info/RECORD +0 -50
- opik_optimizer-1.0.6.dist-info/licenses/LICENSE +0 -21
- {opik_optimizer-1.0.6.dist-info → opik_optimizer-2.0.0.dist-info}/WHEEL +0 -0
- {opik_optimizer-1.0.6.dist-info → opik_optimizer-2.0.0.dist-info}/top_level.txt +0 -0
opik_optimizer/gepa_optimizer/gepa_optimizer.py (added in 2.0.0)
@@ -0,0 +1,653 @@
```python
import logging
from contextlib import nullcontext
from typing import Any, ContextManager
from collections.abc import Callable

import opik
from opik import Dataset
from opik.evaluation.metrics.score_result import ScoreResult

from ..base_optimizer import BaseOptimizer
from ..optimization_config import chat_prompt, mappers
from ..optimization_result import OptimizationResult
from ..optimizable_agent import OptimizableAgent
from ..utils import (
    optimization_context,
    create_litellm_agent_class,
    disable_experiment_reporting,
    enable_experiment_reporting,
)
from .. import task_evaluator
from . import reporting as gepa_reporting
from .adapter import OpikDataInst, OpikGEPAAdapter

logger = logging.getLogger(__name__)


class GepaOptimizer(BaseOptimizer):
    """Minimal integration against the upstream GEPA engine."""

    def __init__(
        self,
        model: str,
        project_name: str | None = None,
        reflection_model: str | None = None,
        verbose: int = 1,
        seed: int = 42,
        **model_kwargs: Any,
    ) -> None:
        # Validate required parameters
        if model is None:
            raise ValueError("model parameter is required and cannot be None")
        if not isinstance(model, str):
            raise ValueError(f"model must be a string, got {type(model).__name__}")
        if not model.strip():
            raise ValueError("model cannot be empty or whitespace-only")

        # Validate optional parameters
        if project_name is not None and not isinstance(project_name, str):
            raise ValueError(
                f"project_name must be a string or None, got {type(project_name).__name__}"
            )

        if reflection_model is not None and not isinstance(reflection_model, str):
            raise ValueError(
                f"reflection_model must be a string or None, got {type(reflection_model).__name__}"
            )

        if not isinstance(verbose, int):
            raise ValueError(
                f"verbose must be an integer, got {type(verbose).__name__}"
            )
        if verbose < 0:
            raise ValueError("verbose must be non-negative")

        if not isinstance(seed, int):
            raise ValueError(f"seed must be an integer, got {type(seed).__name__}")

        super().__init__(model=model, verbose=verbose, seed=seed, **model_kwargs)
        self.project_name = project_name
        self.reflection_model = reflection_model or model
        self.num_threads = self.model_kwargs.pop("num_threads", 6)
        self._gepa_live_metric_calls = 0
        self._adapter = None  # Will be set during optimization

    def get_optimizer_metadata(self) -> dict[str, Any]:
        return {
            "project_name": self.project_name,
            "reflection_model": self.reflection_model,
        }

    def cleanup(self) -> None:
        """
        Clean up GEPA-specific resources.
        """
        # Call parent cleanup
        super().cleanup()

        # Clear GEPA-specific resources
        self._adapter = None
        self._gepa_live_metric_calls = 0

        logger.debug("Cleaned up GEPA-specific resources")

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------

    def _build_data_insts(
        self,
        dataset_items: list[dict[str, Any]],
        input_key: str,
        output_key: str,
    ) -> list[OpikDataInst]:
        data_insts: list[OpikDataInst] = []
        for item in dataset_items:
            additional_context: dict[str, str] = {}
            metadata = item.get("metadata") or {}
            if isinstance(metadata, dict):
                context_value = metadata.get("context")
                if isinstance(context_value, str):
                    additional_context["context"] = context_value
            if "context" in item and isinstance(item["context"], str):
                additional_context.setdefault("context", item["context"])

            data_insts.append(
                OpikDataInst(
                    input_text=str(item.get(input_key, "")),
                    answer=str(item.get(output_key, "")),
                    additional_context=additional_context,
                    opik_item=item,
                )
            )
        return data_insts

    def _apply_system_text(
        self, prompt_obj: chat_prompt.ChatPrompt, system_text: str
    ) -> chat_prompt.ChatPrompt:
        updated = prompt_obj.copy()
        if updated.messages is not None:
            messages = updated.get_messages()
            if messages and messages[0].get("role") == "system":
                messages[0]["content"] = system_text
            else:
                messages.insert(0, {"role": "system", "content": system_text})
            updated.set_messages(messages)
        else:
            updated.system = system_text
        return updated

    def _infer_dataset_keys(self, dataset: Dataset) -> tuple[str, str]:
        items = dataset.get_items(1)
        if not items:
            return "text", "label"
        sample = items[0]
        output_candidates = ["label", "answer", "output", "expected_output"]
        output_key = next((k for k in output_candidates if k in sample), "label")
        excluded = {output_key, "id", "metadata"}
        input_key = next((k for k in sample.keys() if k not in excluded), "text")
        return input_key, output_key

    # ------------------------------------------------------------------
    # Base optimizer overrides
    # ------------------------------------------------------------------

    def optimize_prompt(
        self,
        prompt: chat_prompt.ChatPrompt,
        dataset: Dataset,
        metric: Callable,
        experiment_config: dict | None = None,
        n_samples: int | None = None,
        auto_continue: bool = False,
        agent_class: type[OptimizableAgent] | None = None,
        **kwargs: Any,
    ) -> OptimizationResult:
        """
        Optimize a prompt using GEPA (Genetic-Pareto) algorithm.

        Args:
            prompt: The prompt to optimize
            dataset: Opik Dataset to optimize on
            metric: Metric function to evaluate on
            experiment_config: Optional configuration for the experiment
            n_samples: Optional number of items to test in the dataset
            auto_continue: Whether to auto-continue optimization
            agent_class: Optional agent class to use
            **kwargs: GEPA-specific parameters:
                max_metric_calls (int | None): Maximum number of metric evaluations (default: 30)
                reflection_minibatch_size (int): Size of reflection minibatches (default: 3)
                candidate_selection_strategy (str): Strategy for candidate selection (default: "pareto")
                skip_perfect_score (bool): Skip candidates with perfect scores (default: True)
                perfect_score (float): Score considered perfect (default: 1.0)
                use_merge (bool): Enable merge operations (default: False)
                max_merge_invocations (int): Maximum merge invocations (default: 5)
                run_dir (str | None): Directory for run outputs (default: None)
                track_best_outputs (bool): Track best outputs during optimization (default: False)
                display_progress_bar (bool): Display progress bar (default: False)
                seed (int): Random seed for reproducibility (default: 42)
                raise_on_exception (bool): Raise exceptions instead of continuing (default: True)
                mcp_config (MCPExecutionConfig | None): MCP tool calling configuration (default: None)

        Returns:
            OptimizationResult: Result of the optimization
        """
        # Use base class validation and setup methods
        self.validate_optimization_inputs(prompt, dataset, metric)

        # Extract GEPA-specific parameters from kwargs
        max_metric_calls: int | None = kwargs.get("max_metric_calls", 30)
        reflection_minibatch_size: int = int(kwargs.get("reflection_minibatch_size", 3))
        candidate_selection_strategy: str = str(
            kwargs.get("candidate_selection_strategy", "pareto")
        )
        skip_perfect_score: bool = kwargs.get("skip_perfect_score", True)
        perfect_score: float = float(kwargs.get("perfect_score", 1.0))
        use_merge: bool = kwargs.get("use_merge", False)
        max_merge_invocations: int = int(kwargs.get("max_merge_invocations", 5))
        run_dir: str | None = kwargs.get("run_dir", None)
        track_best_outputs: bool = kwargs.get("track_best_outputs", False)
        display_progress_bar: bool = kwargs.get("display_progress_bar", False)
        seed: int = int(kwargs.get("seed", 42))
        raise_on_exception: bool = kwargs.get("raise_on_exception", True)
        kwargs.pop("mcp_config", None)  # Added for MCP support (for future use)

        prompt = prompt.copy()
        if self.project_name:
            prompt.project_name = self.project_name
        if prompt.model is None:
            prompt.model = self.model
        if not prompt.model_kwargs:
            prompt.model_kwargs = dict(self.model_kwargs)

        seed_prompt_text = self._extract_system_text(prompt)
        input_key, output_key = self._infer_dataset_keys(dataset)

        items = dataset.get_items()
        if n_samples and 0 < n_samples < len(items):
            items = items[:n_samples]

        data_insts = self._build_data_insts(items, input_key, output_key)

        self._gepa_live_metric_calls = 0

        base_prompt = prompt.copy()

        opt_id: str | None = None
        ds_id: str | None = getattr(dataset, "id", None)

        opik_client = opik.Opik(project_name=self.project_name)

        disable_experiment_reporting()

        try:
            with optimization_context(
                client=opik_client,
                dataset_name=dataset.name,
                objective_name=metric.__name__,
                metadata={"optimizer": self.__class__.__name__},
            ) as optimization:
                try:
                    opt_id = optimization.id if optimization is not None else None
                except Exception:
                    opt_id = None

                gepa_reporting.display_header(
                    algorithm="GEPA",
                    optimization_id=opt_id,
                    dataset_id=getattr(dataset, "id", None),
                    verbose=self.verbose,
                )

                from ..reporting_utils import display_configuration as _display_config

                _display_config(
                    messages=prompt.get_messages(),
                    optimizer_config={
                        "optimizer": "GEPA",
                        "model": self.model,
                        "reflection_model": self.reflection_model,
                        "max_metric_calls": max_metric_calls,
                        "reflection_minibatch_size": reflection_minibatch_size,
                        "candidate_selection_strategy": candidate_selection_strategy,
                        "n_samples": n_samples or "all",
                    },
                    verbose=self.verbose,
                )

                # Baseline evaluation
                initial_prompt_messages = prompt.get_messages()
                initial_score = 0.0
                with gepa_reporting.baseline_evaluation(verbose=self.verbose) as baseline:
                    try:
                        baseline_suppress: ContextManager[Any] = nullcontext()
                        try:
                            from ..reporting_utils import (
                                suppress_opik_logs as _suppress_logs,
                            )

                            baseline_suppress = _suppress_logs()
                        except Exception:
                            pass
                        eval_kwargs = dict(
                            prompt=prompt,
                            dataset=dataset,
                            metric=metric,
                            n_samples=n_samples,
                            optimization_id=opt_id,
                            extra_metadata={"phase": "baseline"},
                            verbose=0,
                        )
                        with baseline_suppress:
                            initial_score = float(
                                self._evaluate_prompt_logged(**eval_kwargs)
                            )
                        baseline.set_score(initial_score)
                    except Exception:
                        logger.exception("Baseline evaluation failed")

                adapter_prompt = self._apply_system_text(base_prompt, seed_prompt_text)
                adapter_prompt.project_name = self.project_name
                adapter_prompt.model = self.model
                # Filter out GEPA-specific parameters that shouldn't be passed to LLM
                filtered_model_kwargs = {
                    k: v
                    for k, v in self.model_kwargs.items()
                    if k not in ["num_prompts_per_round", "rounds"]
                }
                adapter_prompt.model_kwargs = filtered_model_kwargs

                adapter = OpikGEPAAdapter(
                    base_prompt=adapter_prompt,
                    optimizer=self,
                    metric=metric,
                    system_fallback=seed_prompt_text,
                )

                try:
                    import gepa
                    import inspect
                except Exception as exc:  # pragma: no cover
                    raise ImportError("gepa package is required for GepaOptimizer") from exc

                kwargs_gepa: dict[str, Any] = {
                    "seed_candidate": {"system_prompt": seed_prompt_text},
                    "trainset": data_insts,
                    "valset": data_insts,
                    "adapter": adapter,
                    "task_lm": None,
                    "reflection_lm": self.reflection_model,
                    "candidate_selection_strategy": candidate_selection_strategy,
                    "skip_perfect_score": skip_perfect_score,
                    "reflection_minibatch_size": reflection_minibatch_size,
                    "perfect_score": perfect_score,
                    "use_merge": use_merge,
                    "max_merge_invocations": max_merge_invocations,
                    "max_metric_calls": max_metric_calls,
                    "run_dir": run_dir,
                    "track_best_outputs": track_best_outputs,
                    "display_progress_bar": display_progress_bar,
                    "seed": seed,
                    "raise_on_exception": raise_on_exception,
                    "logger": gepa_reporting.RichGEPAOptimizerLogger(
                        self, verbose=self.verbose
                    ),
                }

                optimize_sig = None
                try:
                    optimize_sig = inspect.signature(gepa.optimize)
                except Exception:
                    optimize_sig = None

                if optimize_sig and "stop_callbacks" not in optimize_sig.parameters:
                    kwargs_gepa["max_metric_calls"] = max_metric_calls

                with gepa_reporting.start_gepa_optimization(verbose=self.verbose):
                    gepa_result = gepa.optimize(**kwargs_gepa)

                try:
                    opt_id = optimization.id if optimization is not None else None
                except Exception:
                    opt_id = None

        finally:
            enable_experiment_reporting()

        # ------------------------------------------------------------------
        # Rescoring & result assembly
        # ------------------------------------------------------------------

        candidates: list[dict[str, str]] = getattr(gepa_result, "candidates", []) or []
        val_scores: list[float] = list(getattr(gepa_result, "val_aggregate_scores", []))

        rescored: list[float] = []
        candidate_rows: list[dict[str, Any]] = []
        history: list[dict[str, Any]] = []

        for idx, candidate in enumerate(candidates):
            candidate_prompt = self._extract_system_text_from_candidate(
                candidate, seed_prompt_text
            )
            prompt_variant = self._apply_system_text(prompt, candidate_prompt)
            prompt_variant.project_name = self.project_name
            prompt_variant.model = self.model
            # Filter out GEPA-specific parameters that shouldn't be passed to LLM
            filtered_model_kwargs = {
                k: v
                for k, v in self.model_kwargs.items()
                if k not in ["num_prompts_per_round", "rounds"]
            }
            prompt_variant.model_kwargs = filtered_model_kwargs

            eval_kwargs = dict(
                prompt=prompt_variant,
                dataset=dataset,
                metric=metric,
                n_samples=n_samples,
                optimization_id=opt_id,
                extra_metadata={"phase": "rescoring", "candidate_index": idx},
                verbose=0,
            )
            try:
                score = float(self._evaluate_prompt_logged(**eval_kwargs))
            except Exception:
                logger.debug("Rescoring failed for candidate %s", idx, exc_info=True)
                score = 0.0

            rescored.append(score)
            candidate_rows.append(
                {
                    "iteration": idx + 1,
                    "system_prompt": candidate_prompt,
                    "gepa_score": val_scores[idx] if idx < len(val_scores) else None,
                    "opik_score": score,
                    "source": "GEPA",
                }
            )
            history.append(
                {
                    "iteration": idx + 1,
                    "prompt_candidate": candidate_prompt,
                    "scores": [
                        {
                            "metric_name": f"GEPA-{metric.__name__}",
                            "score": val_scores[idx] if idx < len(val_scores) else None,
                        },
                        {"metric_name": metric.__name__, "score": score},
                    ],
                    "metadata": {},
                }
            )

        if rescored:
            best_idx = max(range(len(rescored)), key=lambda i: rescored[i])
            best_score = rescored[best_idx]
        else:
            best_idx = getattr(gepa_result, "best_idx", 0) or 0
            best_score = float(val_scores[best_idx]) if val_scores else 0.0

        best_candidate = (
            candidates[best_idx] if candidates else {"system_prompt": seed_prompt_text}
        )
        best_prompt_text = self._extract_system_text_from_candidate(
            best_candidate, seed_prompt_text
        )

        final_prompt = self._apply_system_text(prompt, best_prompt_text)
        final_prompt.project_name = self.project_name
        final_prompt.model = self.model
        # Filter out GEPA-specific parameters that shouldn't be passed to LLM
        filtered_model_kwargs = {
            k: v
            for k, v in self.model_kwargs.items()
            if k not in ["num_prompts_per_round", "rounds"]
        }
        final_prompt.model_kwargs = filtered_model_kwargs

        final_eval_kwargs = dict(
            prompt=final_prompt,
            dataset=dataset,
            metric=metric,
            n_samples=n_samples,
            optimization_id=opt_id,
            extra_metadata={"phase": "final", "selected": True},
            verbose=0,
        )
        suppress_logs: ContextManager[Any] = nullcontext()
        try:
            from ..reporting_utils import suppress_opik_logs as _suppress_logs

            suppress_logs = _suppress_logs()
        except Exception:
            pass

        with suppress_logs:
            try:
                self._evaluate_prompt_logged(**final_eval_kwargs)
            except Exception:
                logger.debug("Final evaluation failed", exc_info=True)

        per_item_scores: list[dict[str, Any]] = []
        try:
            analysis_prompt = final_prompt.copy()
            agent_cls = create_litellm_agent_class(analysis_prompt, optimizer_ref=self)
            agent = agent_cls(analysis_prompt)
            for item in items:
                messages = analysis_prompt.get_messages(item)
                output_text = agent.invoke(messages).strip()
                metric_result = metric(item, output_text)
                if hasattr(metric_result, "value"):
                    score_val = float(metric_result.value)
                elif hasattr(metric_result, "score"):
                    score_val = float(metric_result.score)
                else:
                    score_val = float(metric_result)
                per_item_scores.append(
                    {
                        "dataset_item_id": item.get("id"),
                        "score": score_val,
                        "answer": item.get(output_key),
                        "output": output_text,
                    }
                )
        except Exception:
            logger.debug("Per-item diagnostics failed", exc_info=True)

        details: dict[str, Any] = {
            "model": self.model,
            "temperature": self.model_kwargs.get("temperature"),
            "optimizer": self.__class__.__name__,
            "num_candidates": getattr(gepa_result, "num_candidates", None),
            "total_metric_calls": getattr(gepa_result, "total_metric_calls", None),
            "parents": getattr(gepa_result, "parents", None),
            "val_scores": val_scores,
            "opik_rescored_scores": rescored,
            "candidate_summary": candidate_rows,
            "best_candidate_iteration": (
                candidate_rows[best_idx]["iteration"] if candidate_rows else 0
            ),
            "selected_candidate_index": best_idx,
            "selected_candidate_gepa_score": (
                val_scores[best_idx] if best_idx < len(val_scores) else None
            ),
            "selected_candidate_opik_score": best_score,
            "gepa_live_metric_used": True,
            "gepa_live_metric_call_count": self._gepa_live_metric_calls,
            "selected_candidate_item_scores": per_item_scores,
            "dataset_item_ids": [item.get("id") for item in items],
        }
        if experiment_config:
            details["experiment"] = experiment_config

        final_messages = final_prompt.get_messages()

        if self.verbose >= 1:
            gepa_reporting.display_candidate_scores(
                candidate_rows, verbose=self.verbose
            )
            gepa_reporting.display_selected_candidate(
                best_prompt_text, best_score, verbose=self.verbose
            )

        if logger.isEnabledFor(logging.DEBUG):
            for idx, row in enumerate(candidate_rows):
                logger.debug(
                    "candidate=%s source=%s gepa=%s opik=%s",
                    idx,
                    row.get("source"),
                    row.get("gepa_score"),
                    row.get("opik_score"),
                )
            logger.debug(
                "selected candidate idx=%s gepa=%s opik=%.4f",
                best_idx,
                details.get("selected_candidate_gepa_score"),
                best_score,
            )

        return OptimizationResult(
            optimizer=self.__class__.__name__,
            prompt=final_messages,
            score=best_score,
            metric_name=metric.__name__,
            optimization_id=opt_id,
            dataset_id=ds_id,
            initial_prompt=initial_prompt_messages,
            initial_score=initial_score,
            details=details,
            history=history,
            llm_calls=None,
        )

    # ------------------------------------------------------------------
    # Helpers used by BaseOptimizer.evaluate_prompt
    # ------------------------------------------------------------------

    def _extract_system_text(self, prompt: chat_prompt.ChatPrompt) -> str:
        messages = prompt.get_messages()
        for message in messages:
            if message.get("role") == "system":
                return str(message.get("content", "")).strip()
        for message in messages:
            if message.get("role") == "user":
                return f"You are a helpful assistant. Respond to: {message.get('content', '')}"
        return "You are a helpful assistant."

    def _extract_system_text_from_candidate(
        self, candidate: dict[str, Any], fallback: str
    ) -> str:
        for key in ("system_prompt", "system", "prompt"):
            value = candidate.get(key)
            if isinstance(value, str) and value.strip():
                return value
        return fallback

    def _evaluate_prompt_logged(
        self,
        prompt: chat_prompt.ChatPrompt,
        dataset: Dataset,
        metric: Callable[[dict[str, Any], str], ScoreResult],
        n_samples: int | None = None,
        dataset_item_ids: list[str] | None = None,
        experiment_config: dict[str, Any] | None = None,
        optimization_id: str | None = None,
        extra_metadata: dict[str, Any] | None = None,
        verbose: int = 1,
    ) -> float:
        if prompt.model is None:
            prompt.model = self.model
        if prompt.model_kwargs is None:
            prompt.model_kwargs = self.model_kwargs

        agent_class = create_litellm_agent_class(prompt, optimizer_ref=self)
        self.agent_class = agent_class
        agent = agent_class(prompt)

        def llm_task(dataset_item: dict[str, Any]) -> dict[str, str]:
            messages = prompt.get_messages(dataset_item)
            raw = agent.invoke(messages)
            return {mappers.EVALUATED_LLM_TASK_OUTPUT: raw.strip()}

        configuration_updates = self._drop_none({"gepa": extra_metadata})
        experiment_config = self._prepare_experiment_config(
            prompt=prompt,
            dataset=dataset,
            metric=metric,
            experiment_config=experiment_config,
            configuration_updates=configuration_updates,
        )

        score = task_evaluator.evaluate(
            dataset=dataset,
            dataset_item_ids=dataset_item_ids,
            metric=metric,
            evaluated_task=llm_task,
            num_threads=self.num_threads,
            project_name=experiment_config.get("project_name"),
            experiment_config=experiment_config,
            optimization_id=optimization_id,
            n_samples=n_samples,
            verbose=verbose,
        )
        return score
```
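
The `_infer_dataset_keys` helper above chooses the output column from a fixed candidate list, then takes the first remaining non-reserved column as the input. A minimal sketch of that heuristic applied to a hypothetical dataset item (the field names are illustrative, not from the package):

```python
# Sketch of the _infer_dataset_keys heuristic from the listing above,
# applied to a hypothetical dataset item.
sample = {"id": "item-1", "question": "What is 2 + 2?", "answer": "4", "metadata": {}}

output_candidates = ["label", "answer", "output", "expected_output"]
output_key = next((k for k in output_candidates if k in sample), "label")  # -> "answer"
excluded = {output_key, "id", "metadata"}
input_key = next((k for k in sample.keys() if k not in excluded), "text")  # -> "question"

assert (input_key, output_key) == ("question", "answer")
```

When the dataset has no items, the helper falls back to `("text", "label")`.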
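
Read together, the `__init__` and `optimize_prompt` signatures suggest a calling pattern along the following lines. This is a sketch under stated assumptions, not an official example: the dataset name, model identifiers, and metric body are invented, and `ChatPrompt` is assumed to be importable from the package root (its module is `opik_optimizer/optimization_config/chat_prompt.py` in the file list above).

```python
from typing import Any

import opik
from opik.evaluation.metrics.score_result import ScoreResult
from opik_optimizer import ChatPrompt  # assumed re-export of optimization_config.chat_prompt
from opik_optimizer.gepa_optimizer import GepaOptimizer


def exact_match(dataset_item: dict[str, Any], llm_output: str) -> ScoreResult:
    # Matches the metric contract used by _evaluate_prompt_logged:
    # (dataset_item, output_text) -> ScoreResult
    expected = str(dataset_item.get("answer", ""))
    return ScoreResult(name="exact_match", value=float(llm_output.strip() == expected))


dataset = opik.Opik().get_dataset("my-qa-dataset")  # hypothetical dataset name

prompt = ChatPrompt(
    system="You are a concise QA assistant.",
    user="{question}",
)

optimizer = GepaOptimizer(
    model="openai/gpt-4o-mini",        # task model (illustrative)
    reflection_model="openai/gpt-4o",  # defaults to `model` when omitted
    project_name="gepa-demo",
)

result = optimizer.optimize_prompt(
    prompt=prompt,
    dataset=dataset,
    metric=exact_match,
    n_samples=20,                 # evaluate on a subset of the dataset
    max_metric_calls=30,          # GEPA-specific kwarg from the docstring above
    reflection_minibatch_size=3,
)
print(result.score)
```

Note that, per the listing, `max_metric_calls` is forwarded to the GEPA engine itself; the rescoring pass that re-evaluates every candidate with the Opik metric runs afterwards and is not counted against that budget.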