opik-optimizer 1.0.6__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opik_optimizer/__init__.py +2 -0
- opik_optimizer/_throttle.py +2 -1
- opik_optimizer/base_optimizer.py +28 -11
- opik_optimizer/colbert.py +236 -0
- opik_optimizer/data/context7_eval.jsonl +3 -0
- opik_optimizer/datasets/context7_eval.py +90 -0
- opik_optimizer/datasets/tiny_test.py +33 -34
- opik_optimizer/datasets/truthful_qa.py +2 -2
- opik_optimizer/evolutionary_optimizer/crossover_ops.py +194 -0
- opik_optimizer/evolutionary_optimizer/evaluation_ops.py +73 -0
- opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +124 -941
- opik_optimizer/evolutionary_optimizer/helpers.py +10 -0
- opik_optimizer/evolutionary_optimizer/llm_support.py +134 -0
- opik_optimizer/evolutionary_optimizer/mutation_ops.py +292 -0
- opik_optimizer/evolutionary_optimizer/population_ops.py +223 -0
- opik_optimizer/evolutionary_optimizer/prompts.py +305 -0
- opik_optimizer/evolutionary_optimizer/reporting.py +16 -4
- opik_optimizer/evolutionary_optimizer/style_ops.py +86 -0
- opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +26 -23
- opik_optimizer/few_shot_bayesian_optimizer/reporting.py +12 -5
- opik_optimizer/gepa_optimizer/__init__.py +3 -0
- opik_optimizer/gepa_optimizer/adapter.py +152 -0
- opik_optimizer/gepa_optimizer/gepa_optimizer.py +556 -0
- opik_optimizer/gepa_optimizer/reporting.py +181 -0
- opik_optimizer/logging_config.py +42 -7
- opik_optimizer/mcp_utils/__init__.py +22 -0
- opik_optimizer/mcp_utils/mcp.py +541 -0
- opik_optimizer/mcp_utils/mcp_second_pass.py +152 -0
- opik_optimizer/mcp_utils/mcp_simulator.py +116 -0
- opik_optimizer/mcp_utils/mcp_workflow.py +493 -0
- opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +399 -69
- opik_optimizer/meta_prompt_optimizer/reporting.py +16 -2
- opik_optimizer/mipro_optimizer/_lm.py +20 -20
- opik_optimizer/mipro_optimizer/_mipro_optimizer_v2.py +51 -50
- opik_optimizer/mipro_optimizer/mipro_optimizer.py +33 -28
- opik_optimizer/mipro_optimizer/utils.py +2 -4
- opik_optimizer/optimizable_agent.py +16 -16
- opik_optimizer/optimization_config/chat_prompt.py +44 -23
- opik_optimizer/optimization_config/configs.py +3 -3
- opik_optimizer/optimization_config/mappers.py +9 -8
- opik_optimizer/optimization_result.py +21 -14
- opik_optimizer/reporting_utils.py +61 -10
- opik_optimizer/task_evaluator.py +9 -8
- opik_optimizer/utils/__init__.py +15 -0
- opik_optimizer/{utils.py → utils/core.py} +111 -26
- opik_optimizer/utils/dataset_utils.py +49 -0
- opik_optimizer/utils/prompt_segments.py +186 -0
- {opik_optimizer-1.0.6.dist-info → opik_optimizer-1.1.0.dist-info}/METADATA +93 -16
- opik_optimizer-1.1.0.dist-info/RECORD +73 -0
- opik_optimizer-1.1.0.dist-info/licenses/LICENSE +203 -0
- opik_optimizer-1.0.6.dist-info/RECORD +0 -50
- opik_optimizer-1.0.6.dist-info/licenses/LICENSE +0 -21
- {opik_optimizer-1.0.6.dist-info → opik_optimizer-1.1.0.dist-info}/WHEEL +0 -0
- {opik_optimizer-1.0.6.dist-info → opik_optimizer-1.1.0.dist-info}/top_level.txt +0 -0
opik_optimizer/gepa_optimizer/gepa_optimizer.py
@@ -0,0 +1,556 @@
from __future__ import annotations

import logging
from contextlib import nullcontext
from typing import Any, ContextManager
from collections.abc import Callable

import opik
from opik import Dataset
from opik.evaluation.metrics.score_result import ScoreResult

from ..base_optimizer import BaseOptimizer
from ..optimization_config import chat_prompt, mappers
from ..optimization_result import OptimizationResult
from ..utils import optimization_context, create_litellm_agent_class
from ..logging_config import setup_logging as _setup_logging
from .. import task_evaluator
from . import reporting as gepa_reporting
from .adapter import OpikDataInst, OpikGEPAAdapter


_setup_logging()
LOGGER = logging.getLogger("opik_optimizer.gepa.optimizer")


class GepaOptimizer(BaseOptimizer):
    """Minimal integration against the upstream GEPA engine."""

    def __init__(
        self,
        model: str,
        project_name: str | None = None,
        reflection_model: str | None = None,
        verbose: int = 1,
        **model_kwargs: Any,
    ) -> None:
        super().__init__(model=model, verbose=verbose, **model_kwargs)
        self.project_name = project_name
        self.reflection_model = reflection_model or model
        self.num_threads = self.model_kwargs.pop("num_threads", 6)
        self.seed = self.model_kwargs.pop("seed", 42)
        self._gepa_live_metric_calls = 0

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------

    def _build_data_insts(
        self,
        dataset_items: list[dict[str, Any]],
        input_key: str,
        output_key: str,
    ) -> list[OpikDataInst]:
        data_insts: list[OpikDataInst] = []
        for item in dataset_items:
            additional_context: dict[str, str] = {}
            metadata = item.get("metadata") or {}
            if isinstance(metadata, dict):
                context_value = metadata.get("context")
                if isinstance(context_value, str):
                    additional_context["context"] = context_value
            if "context" in item and isinstance(item["context"], str):
                additional_context.setdefault("context", item["context"])

            data_insts.append(
                OpikDataInst(
                    input_text=str(item.get(input_key, "")),
                    answer=str(item.get(output_key, "")),
                    additional_context=additional_context,
                    opik_item=item,
                )
            )
        return data_insts

    def _apply_system_text(
        self, prompt_obj: chat_prompt.ChatPrompt, system_text: str
    ) -> chat_prompt.ChatPrompt:
        updated = prompt_obj.copy()
        if updated.messages is not None:
            messages = updated.get_messages()
            if messages and messages[0].get("role") == "system":
                messages[0]["content"] = system_text
            else:
                messages.insert(0, {"role": "system", "content": system_text})
            updated.set_messages(messages)
        else:
            updated.system = system_text
        return updated

    def _infer_dataset_keys(self, dataset: Dataset) -> tuple[str, str]:
        items = dataset.get_items(1)
        if not items:
            return "text", "label"
        sample = items[0]
        output_candidates = ["label", "answer", "output", "expected_output"]
        output_key = next((k for k in output_candidates if k in sample), "label")
        excluded = {output_key, "id", "metadata"}
        input_key = next((k for k in sample.keys() if k not in excluded), "text")
        return input_key, output_key

    # ------------------------------------------------------------------
    # Base optimizer overrides
    # ------------------------------------------------------------------

    def optimize_prompt(
        self,
        prompt: chat_prompt.ChatPrompt,
        dataset: str | Dataset,
        metric: Callable[[dict[str, Any], str], ScoreResult],
        experiment_config: dict[str, Any] | None = None,
        **kwargs: Any,
    ) -> OptimizationResult:
        if isinstance(dataset, str):
            client = opik.Opik(project_name=self.project_name)
            dataset = client.get_dataset(dataset)

        max_metric_calls: int = int(kwargs.get("max_metric_calls", 30))
        reflection_minibatch_size: int = int(kwargs.get("reflection_minibatch_size", 3))
        candidate_selection_strategy: str = str(
            kwargs.get("candidate_selection_strategy", "pareto")
        )
        n_samples: int | None = kwargs.get("n_samples")

        prompt = prompt.copy()
        if self.project_name:
            prompt.project_name = self.project_name
        if prompt.model is None:
            prompt.model = self.model
        if not prompt.model_kwargs:
            prompt.model_kwargs = dict(self.model_kwargs)

        seed_prompt_text = self._extract_system_text(prompt)
        input_key, output_key = self._infer_dataset_keys(dataset)

        items = dataset.get_items()
        if n_samples and 0 < n_samples < len(items):
            items = items[:n_samples]

        data_insts = self._build_data_insts(items, input_key, output_key)

        self._gepa_live_metric_calls = 0

        base_prompt = prompt.copy()

        opt_id: str | None = None
        ds_id: str | None = getattr(dataset, "id", None)

        opik_client = opik.Opik(project_name=self.project_name)

        with optimization_context(
            client=opik_client,
            dataset_name=dataset.name,
            objective_name=metric.__name__,
            metadata={"optimizer": self.__class__.__name__},
        ) as optimization:
            try:
                opt_id = optimization.id if optimization is not None else None
            except Exception:
                opt_id = None

            gepa_reporting.display_header(
                algorithm="GEPA",
                optimization_id=opt_id,
                dataset_id=getattr(dataset, "id", None),
                verbose=self.verbose,
            )

            from ..reporting_utils import display_configuration as _display_config

            _display_config(
                messages=prompt.get_messages(),
                optimizer_config={
                    "optimizer": "GEPA",
                    "model": self.model,
                    "reflection_model": self.reflection_model,
                    "max_metric_calls": max_metric_calls,
                    "reflection_minibatch_size": reflection_minibatch_size,
                    "candidate_selection_strategy": candidate_selection_strategy,
                    "n_samples": n_samples or "all",
                },
                verbose=self.verbose,
            )

            # Baseline evaluation
            initial_prompt_messages = prompt.get_messages()
            initial_score = 0.0
            with gepa_reporting.baseline_evaluation(verbose=self.verbose) as baseline:
                try:
                    baseline_suppress: ContextManager[Any] = nullcontext()
                    try:
                        from ..reporting_utils import (
                            suppress_opik_logs as _suppress_logs,
                        )

                        baseline_suppress = _suppress_logs()
                    except Exception:
                        pass
                    eval_kwargs = dict(
                        prompt=prompt,
                        dataset=dataset,
                        metric=metric,
                        n_samples=n_samples,
                        optimization_id=opt_id,
                        extra_metadata={"phase": "baseline"},
                        verbose=0,
                    )
                    with baseline_suppress:
                        initial_score = float(
                            self._evaluate_prompt_logged(**eval_kwargs)
                        )
                    baseline.set_score(initial_score)
                except Exception:
                    LOGGER.exception("Baseline evaluation failed")

            adapter_prompt = self._apply_system_text(base_prompt, seed_prompt_text)
            adapter_prompt.project_name = self.project_name
            adapter_prompt.model = self.model
            # Filter out GEPA-specific parameters that shouldn't be passed to LLM
            filtered_model_kwargs = {
                k: v
                for k, v in self.model_kwargs.items()
                if k not in ["num_prompts_per_round", "rounds"]
            }
            adapter_prompt.model_kwargs = filtered_model_kwargs

            adapter = OpikGEPAAdapter(
                base_prompt=adapter_prompt,
                optimizer=self,
                metric=metric,
                system_fallback=seed_prompt_text,
            )

            try:
                import gepa
                import inspect
            except Exception as exc:  # pragma: no cover
                raise ImportError("gepa package is required for GepaOptimizer") from exc

            kwargs_gepa: dict[str, Any] = {
                "seed_candidate": {"system_prompt": seed_prompt_text},
                "trainset": data_insts,
                "valset": data_insts,
                "adapter": adapter,
                "task_lm": None,
                "reflection_lm": self.reflection_model,
                "candidate_selection_strategy": candidate_selection_strategy,
                "reflection_minibatch_size": reflection_minibatch_size,
                "max_metric_calls": max_metric_calls,
                "display_progress_bar": False,
                "track_best_outputs": False,
                "logger": gepa_reporting.RichGEPAOptimizerLogger(
                    self, verbose=self.verbose
                ),
            }

            optimize_sig = None
            try:
                optimize_sig = inspect.signature(gepa.optimize)
            except Exception:
                optimize_sig = None

            if optimize_sig and "stop_callbacks" not in optimize_sig.parameters:
                kwargs_gepa["max_metric_calls"] = max_metric_calls

            with gepa_reporting.start_gepa_optimization(verbose=self.verbose):
                gepa_result = gepa.optimize(**kwargs_gepa)

        try:
            opt_id = optimization.id if optimization is not None else None
        except Exception:
            opt_id = None

        # ------------------------------------------------------------------
        # Rescoring & result assembly
        # ------------------------------------------------------------------

        candidates: list[dict[str, str]] = getattr(gepa_result, "candidates", []) or []
        val_scores: list[float] = list(getattr(gepa_result, "val_aggregate_scores", []))

        rescored: list[float] = []
        candidate_rows: list[dict[str, Any]] = []
        history: list[dict[str, Any]] = []

        for idx, candidate in enumerate(candidates):
            candidate_prompt = self._extract_system_text_from_candidate(
                candidate, seed_prompt_text
            )
            prompt_variant = self._apply_system_text(prompt, candidate_prompt)
            prompt_variant.project_name = self.project_name
            prompt_variant.model = self.model
            # Filter out GEPA-specific parameters that shouldn't be passed to LLM
            filtered_model_kwargs = {
                k: v
                for k, v in self.model_kwargs.items()
                if k not in ["num_prompts_per_round", "rounds"]
            }
            prompt_variant.model_kwargs = filtered_model_kwargs

            eval_kwargs = dict(
                prompt=prompt_variant,
                dataset=dataset,
                metric=metric,
                n_samples=n_samples,
                optimization_id=opt_id,
                extra_metadata={"phase": "rescoring", "candidate_index": idx},
                verbose=0,
            )
            try:
                score = float(self._evaluate_prompt_logged(**eval_kwargs))
            except Exception:
                LOGGER.debug("Rescoring failed for candidate %s", idx, exc_info=True)
                score = 0.0

            rescored.append(score)
            candidate_rows.append(
                {
                    "iteration": idx + 1,
                    "system_prompt": candidate_prompt,
                    "gepa_score": val_scores[idx] if idx < len(val_scores) else None,
                    "opik_score": score,
                    "source": "GEPA",
                }
            )
            history.append(
                {
                    "iteration": idx + 1,
                    "prompt_candidate": candidate_prompt,
                    "scores": [
                        {
                            "metric_name": f"GEPA-{metric.__name__}",
                            "score": val_scores[idx] if idx < len(val_scores) else None,
                        },
                        {"metric_name": metric.__name__, "score": score},
                    ],
                    "metadata": {},
                }
            )

        if rescored:
            best_idx = max(range(len(rescored)), key=lambda i: rescored[i])
            best_score = rescored[best_idx]
        else:
            best_idx = getattr(gepa_result, "best_idx", 0) or 0
            best_score = float(val_scores[best_idx]) if val_scores else 0.0

        best_candidate = (
            candidates[best_idx] if candidates else {"system_prompt": seed_prompt_text}
        )
        best_prompt_text = self._extract_system_text_from_candidate(
            best_candidate, seed_prompt_text
        )

        final_prompt = self._apply_system_text(prompt, best_prompt_text)
        final_prompt.project_name = self.project_name
        final_prompt.model = self.model
        # Filter out GEPA-specific parameters that shouldn't be passed to LLM
        filtered_model_kwargs = {
            k: v
            for k, v in self.model_kwargs.items()
            if k not in ["num_prompts_per_round", "rounds"]
        }
        final_prompt.model_kwargs = filtered_model_kwargs

        final_eval_kwargs = dict(
            prompt=final_prompt,
            dataset=dataset,
            metric=metric,
            n_samples=n_samples,
            optimization_id=opt_id,
            extra_metadata={"phase": "final", "selected": True},
            verbose=0,
        )
        suppress_logs: ContextManager[Any] = nullcontext()
        try:
            from ..reporting_utils import suppress_opik_logs as _suppress_logs

            suppress_logs = _suppress_logs()
        except Exception:
            pass

        with suppress_logs:
            try:
                self._evaluate_prompt_logged(**final_eval_kwargs)
            except Exception:
                LOGGER.debug("Final evaluation failed", exc_info=True)

        per_item_scores: list[dict[str, Any]] = []
        try:
            analysis_prompt = final_prompt.copy()
            agent_cls = create_litellm_agent_class(analysis_prompt)
            agent = agent_cls(analysis_prompt)
            for item in items:
                messages = analysis_prompt.get_messages(item)
                output_text = agent.invoke(messages).strip()
                metric_result = metric(item, output_text)
                if hasattr(metric_result, "value"):
                    score_val = float(metric_result.value)
                elif hasattr(metric_result, "score"):
                    score_val = float(metric_result.score)
                else:
                    score_val = float(metric_result)
                per_item_scores.append(
                    {
                        "dataset_item_id": item.get("id"),
                        "score": score_val,
                        "answer": item.get(output_key),
                        "output": output_text,
                    }
                )
        except Exception:
            LOGGER.debug("Per-item diagnostics failed", exc_info=True)

        details: dict[str, Any] = {
            "model": self.model,
            "temperature": self.model_kwargs.get("temperature"),
            "optimizer": self.__class__.__name__,
            "num_candidates": getattr(gepa_result, "num_candidates", None),
            "total_metric_calls": getattr(gepa_result, "total_metric_calls", None),
            "parents": getattr(gepa_result, "parents", None),
            "val_scores": val_scores,
            "opik_rescored_scores": rescored,
            "candidate_summary": candidate_rows,
            "best_candidate_iteration": candidate_rows[best_idx]["iteration"]
            if candidate_rows
            else 0,
            "selected_candidate_index": best_idx,
            "selected_candidate_gepa_score": val_scores[best_idx]
            if best_idx < len(val_scores)
            else None,
            "selected_candidate_opik_score": best_score,
            "gepa_live_metric_used": True,
            "gepa_live_metric_call_count": self._gepa_live_metric_calls,
            "selected_candidate_item_scores": per_item_scores,
            "dataset_item_ids": [item.get("id") for item in items],
        }
        if experiment_config:
            details["experiment"] = experiment_config

        final_messages = final_prompt.get_messages()

        if self.verbose >= 1:
            gepa_reporting.display_candidate_scores(
                candidate_rows, verbose=self.verbose
            )
            gepa_reporting.display_selected_candidate(
                best_prompt_text, best_score, verbose=self.verbose
            )

        if LOGGER.isEnabledFor(logging.DEBUG):
            for idx, row in enumerate(candidate_rows):
                LOGGER.debug(
                    "candidate=%s source=%s gepa=%s opik=%s",
                    idx,
                    row.get("source"),
                    row.get("gepa_score"),
                    row.get("opik_score"),
                )
            LOGGER.debug(
                "selected candidate idx=%s gepa=%s opik=%.4f",
                best_idx,
                details.get("selected_candidate_gepa_score"),
                best_score,
            )

        return OptimizationResult(
            optimizer=self.__class__.__name__,
            prompt=final_messages,
            score=best_score,
            metric_name=metric.__name__,
            optimization_id=opt_id,
            dataset_id=ds_id,
            initial_prompt=initial_prompt_messages,
            initial_score=initial_score,
            details=details,
            history=history,
            llm_calls=None,
        )

    # ------------------------------------------------------------------
    # Helpers used by BaseOptimizer.evaluate_prompt
    # ------------------------------------------------------------------

    def _extract_system_text(self, prompt: chat_prompt.ChatPrompt) -> str:
        messages = prompt.get_messages()
        for message in messages:
            if message.get("role") == "system":
                return str(message.get("content", "")).strip()
        for message in messages:
            if message.get("role") == "user":
                return f"You are a helpful assistant. Respond to: {message.get('content', '')}"
        return "You are a helpful assistant."

    def _extract_system_text_from_candidate(
        self, candidate: dict[str, Any], fallback: str
    ) -> str:
        for key in ("system_prompt", "system", "prompt"):
            value = candidate.get(key)
            if isinstance(value, str) and value.strip():
                return value
        return fallback

    def _evaluate_prompt_logged(
        self,
        prompt: chat_prompt.ChatPrompt,
        dataset: Dataset,
        metric: Callable[[dict[str, Any], str], ScoreResult],
        n_samples: int | None = None,
        dataset_item_ids: list[str] | None = None,
        experiment_config: dict[str, Any] | None = None,
        optimization_id: str | None = None,
        extra_metadata: dict[str, Any] | None = None,
        verbose: int = 1,
    ) -> float:
        if prompt.model is None:
            prompt.model = self.model
        if prompt.model_kwargs is None:
            prompt.model_kwargs = self.model_kwargs

        agent_class = create_litellm_agent_class(prompt)
        agent = agent_class(prompt)

        def llm_task(dataset_item: dict[str, Any]) -> dict[str, str]:
            messages = prompt.get_messages(dataset_item)
            raw = agent.invoke(messages)
            return {mappers.EVALUATED_LLM_TASK_OUTPUT: raw.strip()}

        experiment_config = experiment_config or {}
        experiment_config["project_name"] = agent_class.__name__
        experiment_config = {
            **experiment_config,
            **{
                "optimizer": self.__class__.__name__,
                "agent_class": agent_class.__name__,
                "agent_config": prompt.to_dict(),
                "metric": metric.__name__,
                "dataset": dataset.name,
                "configuration": {
                    "prompt": prompt.get_messages(),
                    "gepa": (extra_metadata or {}),
                },
            },
        }

        score = task_evaluator.evaluate(
            dataset=dataset,
            dataset_item_ids=dataset_item_ids,
            metric=metric,
            evaluated_task=llm_task,
            num_threads=self.num_threads,
            project_name=agent_class.project_name,
            experiment_config=experiment_config,
            optimization_id=optimization_id,
            n_samples=n_samples,
            verbose=verbose,
        )
        return score
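For orientation, here is a minimal usage sketch of the new GepaOptimizer based only on the file above. It is not taken from the package's documentation: the top-level re-export of GepaOptimizer and ChatPrompt, the ChatPrompt(system=..., user=...) constructor, the model names, the "my-dataset" name, and its "question"/"answer" fields are all assumptions standing in for your own setup; only the optimize_prompt keyword arguments and the metric shape (dataset item dict plus model output, returning a ScoreResult) come from the code shown above.

# Minimal sketch, not a verified example. Assumes an Opik workspace is configured
# and a dataset named "my-dataset" with "question"/"answer" fields already exists.
from opik_optimizer import ChatPrompt, GepaOptimizer  # top-level re-export assumed
from opik.evaluation.metrics.score_result import ScoreResult


def exact_match(dataset_item: dict, llm_output: str) -> ScoreResult:
    # Metric callable with the (dataset_item, llm_output) -> ScoreResult shape
    # that GepaOptimizer.optimize_prompt expects.
    expected = str(dataset_item.get("answer", "")).strip().lower()
    return ScoreResult(name="exact_match", value=float(llm_output.strip().lower() == expected))


optimizer = GepaOptimizer(
    model="openai/gpt-4o-mini",        # task model
    reflection_model="openai/gpt-4o",  # defaults to the task model when omitted
    project_name="gepa-demo",
    temperature=0.0,                   # forwarded through **model_kwargs
)

prompt = ChatPrompt(
    system="You are a concise assistant.",
    user="{question}",
)

result = optimizer.optimize_prompt(
    prompt=prompt,
    dataset="my-dataset",              # dataset name or an opik.Dataset object
    metric=exact_match,
    max_metric_calls=30,               # optional kwargs read by optimize_prompt
    reflection_minibatch_size=3,
    candidate_selection_strategy="pareto",
    n_samples=20,
)

print(result.score)   # best Opik-rescored score
print(result.prompt)  # optimized chat messages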