opik-optimizer 1.0.6__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff compares publicly released versions of the package as published to their respective registries. It is provided for informational purposes only and reflects the changes between those package versions.
- opik_optimizer/__init__.py +4 -0
- opik_optimizer/_throttle.py +2 -1
- opik_optimizer/base_optimizer.py +402 -28
- opik_optimizer/data/context7_eval.jsonl +3 -0
- opik_optimizer/datasets/context7_eval.py +90 -0
- opik_optimizer/datasets/tiny_test.py +33 -34
- opik_optimizer/datasets/truthful_qa.py +2 -2
- opik_optimizer/evolutionary_optimizer/crossover_ops.py +194 -0
- opik_optimizer/evolutionary_optimizer/evaluation_ops.py +136 -0
- opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +289 -966
- opik_optimizer/evolutionary_optimizer/helpers.py +10 -0
- opik_optimizer/evolutionary_optimizer/llm_support.py +136 -0
- opik_optimizer/evolutionary_optimizer/mcp.py +249 -0
- opik_optimizer/evolutionary_optimizer/mutation_ops.py +306 -0
- opik_optimizer/evolutionary_optimizer/population_ops.py +228 -0
- opik_optimizer/evolutionary_optimizer/prompts.py +352 -0
- opik_optimizer/evolutionary_optimizer/reporting.py +28 -4
- opik_optimizer/evolutionary_optimizer/style_ops.py +86 -0
- opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +90 -81
- opik_optimizer/few_shot_bayesian_optimizer/reporting.py +12 -5
- opik_optimizer/gepa_optimizer/__init__.py +3 -0
- opik_optimizer/gepa_optimizer/adapter.py +154 -0
- opik_optimizer/gepa_optimizer/gepa_optimizer.py +653 -0
- opik_optimizer/gepa_optimizer/reporting.py +181 -0
- opik_optimizer/logging_config.py +42 -7
- opik_optimizer/mcp_utils/__init__.py +22 -0
- opik_optimizer/mcp_utils/mcp.py +541 -0
- opik_optimizer/mcp_utils/mcp_second_pass.py +152 -0
- opik_optimizer/mcp_utils/mcp_simulator.py +116 -0
- opik_optimizer/mcp_utils/mcp_workflow.py +547 -0
- opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +470 -134
- opik_optimizer/meta_prompt_optimizer/reporting.py +16 -2
- opik_optimizer/mipro_optimizer/_lm.py +30 -23
- opik_optimizer/mipro_optimizer/_mipro_optimizer_v2.py +52 -51
- opik_optimizer/mipro_optimizer/mipro_optimizer.py +126 -46
- opik_optimizer/mipro_optimizer/utils.py +2 -4
- opik_optimizer/optimizable_agent.py +21 -16
- opik_optimizer/optimization_config/chat_prompt.py +44 -23
- opik_optimizer/optimization_config/configs.py +3 -3
- opik_optimizer/optimization_config/mappers.py +9 -8
- opik_optimizer/optimization_result.py +22 -14
- opik_optimizer/reporting_utils.py +61 -10
- opik_optimizer/task_evaluator.py +9 -8
- opik_optimizer/utils/__init__.py +15 -0
- opik_optimizer/utils/colbert.py +236 -0
- opik_optimizer/{utils.py → utils/core.py} +160 -33
- opik_optimizer/utils/dataset_utils.py +49 -0
- opik_optimizer/utils/prompt_segments.py +186 -0
- opik_optimizer-2.0.0.dist-info/METADATA +345 -0
- opik_optimizer-2.0.0.dist-info/RECORD +74 -0
- opik_optimizer-2.0.0.dist-info/licenses/LICENSE +203 -0
- opik_optimizer-1.0.6.dist-info/METADATA +0 -181
- opik_optimizer-1.0.6.dist-info/RECORD +0 -50
- opik_optimizer-1.0.6.dist-info/licenses/LICENSE +0 -21
- {opik_optimizer-1.0.6.dist-info → opik_optimizer-2.0.0.dist-info}/WHEEL +0 -0
- {opik_optimizer-1.0.6.dist-info → opik_optimizer-2.0.0.dist-info}/top_level.txt +0 -0
opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py
@@ -1,5 +1,8 @@
-from typing import Any
+from typing import Any
+from collections.abc import Callable
+import warnings

+import copy
 import json
 import logging
 import random
@@ -15,7 +18,6 @@ from opik.evaluation.models.litellm import opik_monitor as opik_litellm_monitor
 from pydantic import BaseModel

 from opik_optimizer import base_optimizer
-from ..utils import create_litellm_agent_class
 from ..optimization_config import chat_prompt, mappers
 from ..optimizable_agent import OptimizableAgent
 from .. import _throttle, optimization_result, task_evaluator, utils
@@ -56,7 +58,7 @@ Respond only with the JSON object. Do not include any explanation or extra text.


 class FewShotPromptTemplate(BaseModel):
-    message_list_with_placeholder:
+    message_list_with_placeholder: list[dict[str, str]]
     example_template: str


@@ -94,8 +96,11 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
             **model_kwargs: Additional model parameters
         """
         if "project_name" in model_kwargs:
-
-                "
+            warnings.warn(
+                "The 'project_name' parameter in optimizer constructor is deprecated. "
+                "Set project_name in the ChatPrompt instead.",
+                DeprecationWarning,
+                stacklevel=2,
             )
             del model_kwargs["project_name"]

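Note: the hunk above replaces the old constructor-side handling of `project_name` with an explicit `DeprecationWarning` and drops the key from `model_kwargs`. A minimal migration sketch, assuming `ChatPrompt` accepts `project_name` and `messages` keyword arguments as the warning text and the surrounding code suggest:

```python
from opik_optimizer.few_shot_bayesian_optimizer.few_shot_bayesian_optimizer import (
    FewShotBayesianOptimizer,
)
from opik_optimizer.optimization_config.chat_prompt import ChatPrompt

# 1.0.6 style: passing project_name to the optimizer constructor now only
# emits a DeprecationWarning and the value is discarded.
# optimizer = FewShotBayesianOptimizer(model="openai/gpt-4o-mini", project_name="my-project")

# 2.0.0 style: attach project_name to the ChatPrompt instead (the keyword
# names here are assumptions based on the deprecation message and the diff).
prompt = ChatPrompt(
    project_name="my-project",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "{question}"},
    ],
)
optimizer = FewShotBayesianOptimizer(model="openai/gpt-4o-mini")
```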
@@ -111,18 +116,22 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
         elif self.verbose == 2:
             logger.setLevel(logging.DEBUG)

-        self._opik_client = opik.Opik()
-        self.llm_call_counter = 0
         logger.debug(f"Initialized FewShotBayesianOptimizer with model: {model}")

+    def get_optimizer_metadata(self) -> dict[str, Any]:
+        return {
+            "min_examples": self.min_examples,
+            "max_examples": self.max_examples,
+        }
+
     @_throttle.rate_limited(_limiter)
     def _call_model(
         self,
         model: str,
-        messages:
+        messages: list[dict[str, str]],
         seed: int,
-        model_kwargs:
-    ) ->
+        model_kwargs: dict[str, Any],
+    ) -> dict[str, Any]:
         """
         Args:
             model: The model to use for the call
@@ -133,7 +142,7 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
         Returns:
             Dict containing the model's response
         """
-        self.
+        self.increment_llm_counter()

         current_model_kwargs = self.model_kwargs.copy()
         current_model_kwargs.update(model_kwargs)
@@ -159,8 +168,8 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
         return response

     def _split_dataset(
-        self, dataset:
-    ) ->
+        self, dataset: list[dict[str, Any]], train_ratio: float
+    ) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]:
         """
         Split the dataset into training and validation sets.

@@ -194,7 +203,7 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
         self,
         model: str,
         prompt: chat_prompt.ChatPrompt,
-        few_shot_examples:
+        few_shot_examples: list[dict[str, Any]],
     ) -> FewShotPromptTemplate:
         """
         Generate a few-shot prompt template that can be used to insert examples into the prompt.
@@ -215,7 +224,7 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
             "examples": few_shot_examples,
         }

-        messages:
+        messages: list[dict[str, str]] = [
             {"role": "system", "content": SYSTEM_PROMPT_TEMPLATE},
             {"role": "user", "content": json.dumps(user_message)},
         ]
@@ -244,9 +253,9 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
         metric: Callable,
         baseline_score: float,
         n_trials: int = 10,
-        optimization_id:
-        experiment_config:
-        n_samples:
+        optimization_id: str | None = None,
+        experiment_config: dict | None = None,
+        n_samples: int | None = None,
     ) -> optimization_result.OptimizationResult:
         reporting.start_optimization_run(verbose=self.verbose)

@@ -259,19 +268,20 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
         if n_samples is not None and n_samples < len(dataset_items):
             eval_dataset_item_ids = random.sample(all_dataset_item_ids, n_samples)

-
-
-
-
-
-
-
-
-
-
-
-
-
+        configuration_updates = self._drop_none(
+            {
+                "n_trials": n_trials,
+                "n_samples": n_samples,
+                "baseline_score": baseline_score,
+            }
+        )
+        base_experiment_config = self._prepare_experiment_config(
+            prompt=prompt,
+            dataset=dataset,
+            metric=metric,
+            experiment_config=experiment_config,
+            configuration_updates=configuration_updates,
+        )

         # Start Optuna Study
         def optimization_objective(trial: optuna.Trial) -> float:
@@ -326,7 +336,7 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
             ]

             # Log trial config
-            trial_config =
+            trial_config = copy.deepcopy(base_experiment_config)
             trial_config["configuration"]["prompt"] = (
                 messages_for_reporting  # Base instruction
             )
@@ -450,6 +460,7 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
             best_score=best_score,
             best_prompt=best_prompt,
             verbose=self.verbose,
+            tools=getattr(prompt, "tools", None),
         )

         return optimization_result.OptimizationResult(
@@ -479,6 +490,7 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
             },
             history=optuna_history_processed,
             llm_calls=self.llm_call_counter,
+            tool_calls=self.tool_call_counter,
             dataset_id=dataset.id,
             optimization_id=optimization_id,
         )
@@ -488,47 +500,39 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
         prompt: chat_prompt.ChatPrompt,
         dataset: Dataset,
         metric: Callable,
-
-
-
-
+        experiment_config: dict | None = None,
+        n_samples: int | None = None,
+        auto_continue: bool = False,
+        agent_class: type[OptimizableAgent] | None = None,
+        **kwargs: Any,
     ) -> optimization_result.OptimizationResult:
         """
         Args:
-            prompt:
+            prompt: The prompt to optimize
             dataset: Opik Dataset to optimize on
             metric: Metric function to evaluate on
-            n_trials: Number of trials for Bayesian Optimization
             experiment_config: Optional configuration for the experiment, useful to log additional metadata
             n_samples: Optional number of items to test in the dataset
+            auto_continue: Whether to auto-continue optimization
+            agent_class: Optional agent class to use
+            **kwargs: Additional parameters including:
+                n_trials (int): Number of trials for Bayesian Optimization (default: 10)
+                mcp_config (MCPExecutionConfig | None): MCP tool calling configuration (default: None)

         Returns:
             OptimizationResult: Result of the optimization
         """
-
-
+        # Use base class validation and setup methods
+        self.validate_optimization_inputs(prompt, dataset, metric)
+        self.configure_prompt_model(prompt)
+        self.agent_class = self.setup_agent_class(prompt, agent_class)

-
-
-
-        if not callable(metric):
-            raise ValueError(
-                "Metric must be a function that takes `dataset_item` and `llm_output` as arguments."
-            )
-
-        if prompt.model is None:
-            prompt.model = self.model
-        if prompt.model_kwargs is None:
-            prompt.model_kwargs = self.model_kwargs
-
-        if agent_class is None:
-            self.agent_class = create_litellm_agent_class(prompt)
-        else:
-            self.agent_class = agent_class
+        # Extract n_trials from kwargs for backward compatibility
+        n_trials = kwargs.get("n_trials", 10)

         optimization = None
         try:
-            optimization = self.
+            optimization = self.opik_client.create_optimization(
                 dataset_name=dataset.name,
                 objective_name=metric.__name__,
                 metadata={"optimizer": self.__class__.__name__},
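In the `optimize_prompt` hunk above, `n_trials` moves from a named parameter into `**kwargs`, and input validation plus agent setup are delegated to base-class helpers. A hedged sketch of a 2.0.0-style call, reusing the `optimizer` and `prompt` from the previous sketch; the metric signature follows the removed validation message (`dataset_item` and `llm_output`), and `dataset` stands for an existing `opik.Dataset`:

```python
from typing import Any


def exact_match(dataset_item: dict[str, Any], llm_output: str) -> float:
    # Signature per the removed check: a metric takes `dataset_item` and `llm_output`.
    return float(llm_output.strip() == str(dataset_item.get("answer", "")).strip())


result = optimizer.optimize_prompt(
    prompt=prompt,
    dataset=dataset,          # an opik.Dataset created elsewhere
    metric=exact_match,
    n_samples=50,             # optional: evaluate on a random subset of items
    n_trials=20,              # now read from **kwargs (default is 10)
)
print(result)                 # an OptimizationResult, including llm_calls and tool_calls
```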
@@ -557,6 +561,7 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
                 "n_samples": n_samples,
             },
             verbose=self.verbose,
+            tools=getattr(prompt, "tools", None),
         )

         utils.disable_experiment_reporting()
@@ -614,10 +619,10 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
         prompt: chat_prompt.ChatPrompt,
         dataset: opik.Dataset,
         metric: Callable,
-        n_samples:
-        dataset_item_ids:
-        experiment_config:
-        optimization_id:
+        n_samples: int | None = None,
+        dataset_item_ids: list[str] | None = None,
+        experiment_config: dict | None = None,
+        optimization_id: str | None = None,
         **kwargs: Any,
     ) -> float:
         """
@@ -633,20 +638,6 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
         """
         llm_task = self._build_task_from_messages(prompt, prompt.get_messages())

-        experiment_config = experiment_config or {}
-        experiment_config["project_name"] = self.agent_class.__name__
-        experiment_config = {
-            **experiment_config,
-            **{
-                "optimizer": self.__class__.__name__,
-                "agent_class": self.agent_class.__name__,
-                "agent_config": prompt.to_dict(),
-                "metric": metric.__name__,
-                "dataset": dataset.name,
-                "configuration": {"prompt": prompt.get_messages()},
-            },
-        }
-
         if n_samples is not None:
             if dataset_item_ids is not None:
                 raise Exception("Can't use n_samples and dataset_item_ids")
@@ -654,6 +645,24 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
             all_ids = [dataset_item["id"] for dataset_item in dataset.get_items()]
             dataset_item_ids = random.sample(all_ids, n_samples)

+        configuration_updates = self._drop_none(
+            {
+                "n_samples": n_samples,
+                "dataset_item_ids": dataset_item_ids,
+            }
+        )
+        additional_metadata = (
+            {"optimization_id": optimization_id} if optimization_id else None
+        )
+        experiment_config = self._prepare_experiment_config(
+            prompt=prompt,
+            dataset=dataset,
+            metric=metric,
+            experiment_config=experiment_config,
+            configuration_updates=configuration_updates,
+            additional_metadata=additional_metadata,
+        )
+
         logger.debug("Starting FewShotBayesian evaluation...")
         score = task_evaluator.evaluate(
             dataset=dataset,
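Both code paths above now build their experiment configuration through `self._drop_none(...)` and `self._prepare_experiment_config(...)`, which live on the expanded `BaseOptimizer` (see `base_optimizer.py +402 -28` in the file list) and are not shown in this diff. A plausible reading of `_drop_none`, for orientation only:

```python
from typing import Any


def _drop_none(values: dict[str, Any]) -> dict[str, Any]:
    # Sketch of the assumed behaviour: filter out unset (None) options so they
    # don't end up in the experiment's configuration metadata.
    return {key: value for key, value in values.items() if value is not None}
```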
@@ -661,7 +670,7 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
             metric=metric,
             evaluated_task=llm_task,
             num_threads=self.n_threads,
-            project_name=
+            project_name=experiment_config.get("project_name"),
             experiment_config=experiment_config,
             optimization_id=optimization_id,
             verbose=self.verbose,
@@ -673,14 +682,14 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
     def _build_task_from_messages(
         self,
         prompt: chat_prompt.ChatPrompt,
-        messages:
-        few_shot_examples:
-    ) -> Callable[[
+        messages: list[dict[str, str]],
+        few_shot_examples: str | None = None,
+    ) -> Callable[[dict[str, Any]], dict[str, Any]]:
         new_prompt = prompt.copy()
         new_prompt.set_messages(messages)
         agent = self.agent_class(new_prompt)

-        def llm_task(dataset_item:
+        def llm_task(dataset_item: dict[str, Any]) -> dict[str, Any]:
             """
             Process a single dataset item through the LLM task.

opik_optimizer/few_shot_bayesian_optimizer/reporting.py
@@ -1,6 +1,6 @@
 from contextlib import contextmanager
 from io import StringIO
-from typing import Any,
+from typing import Any, Optional, TYPE_CHECKING

 from rich.panel import Panel
 from rich.text import Text
@@ -46,9 +46,16 @@ def display_evaluation(
         yield Reporter()
     finally:
         if verbose >= 1:
-
-
-
+            if score is not None:
+                console.print(
+                    Text(
+                        f"\r Baseline score was: {score:.4f}.\n", style="green"
+                    )
+                )
+            else:
+                console.print(
+                    Text("\r Baseline score was: None\n", style="red")
+                )


 @contextmanager
@@ -121,7 +128,7 @@ def start_optimization_trial(

     # Create a simple object with a method to set the score
     class Reporter:
-        def start_trial(self, messages:
+        def start_trial(self, messages: list[dict[str, str]]) -> None:
             if verbose >= 1:
                 console.print(
                     Text(
opik_optimizer/gepa_optimizer/adapter.py (new file)
@@ -0,0 +1,154 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Any
+from collections.abc import Callable, Iterable
+
+import logging
+
+from gepa.core.adapter import EvaluationBatch, GEPAAdapter
+
+from ..optimization_config import chat_prompt
+from ..utils import create_litellm_agent_class
+
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class OpikDataInst:
+    """Data instance handed to GEPA.
+
+    We keep the original Opik dataset item so metrics and prompt formatting can use it
+    directly without duplicated bookkeeping.
+    """
+
+    input_text: str
+    answer: str
+    additional_context: dict[str, str]
+    opik_item: dict[str, Any]
+
+
+def _extract_system_text(candidate: dict[str, str], fallback: str) -> str:
+    for key in ("system_prompt", "system", "prompt"):
+        value = candidate.get(key)
+        if isinstance(value, str) and value.strip():
+            return value
+    return fallback
+
+
+def _apply_system_text(
+    prompt_obj: chat_prompt.ChatPrompt, system_text: str
+) -> chat_prompt.ChatPrompt:
+    updated = prompt_obj.copy()
+    if updated.messages is not None:
+        messages = updated.get_messages()
+        if messages and messages[0].get("role") == "system":
+            messages[0]["content"] = system_text
+        else:
+            messages.insert(0, {"role": "system", "content": system_text})
+        updated.set_messages(messages)
+    else:
+        updated.system = system_text
+    return updated
+
+
+class OpikGEPAAdapter(GEPAAdapter[OpikDataInst, dict[str, Any], dict[str, Any]]):
+    """Minimal GEPA adapter that routes evaluation through Opik's metric."""
+
+    def __init__(
+        self,
+        base_prompt: chat_prompt.ChatPrompt,
+        optimizer: Any,
+        metric: Callable[[dict[str, Any], str], Any],
+        system_fallback: str,
+    ) -> None:
+        self._base_prompt = base_prompt
+        self._optimizer = optimizer
+        self._metric = metric
+        self._system_fallback = system_fallback
+
+    def evaluate(
+        self,
+        batch: list[OpikDataInst],
+        candidate: dict[str, str],
+        capture_traces: bool = False,
+    ) -> EvaluationBatch[dict[str, Any], dict[str, Any]]:
+        system_text = _extract_system_text(candidate, self._system_fallback)
+        prompt_variant = _apply_system_text(self._base_prompt, system_text)
+
+        agent_class = create_litellm_agent_class(
+            prompt_variant, optimizer_ref=self._optimizer
+        )
+        agent = agent_class(prompt_variant)
+
+        outputs: list[dict[str, Any]] = []
+        scores: list[float] = []
+        trajectories: list[dict[str, Any]] | None = [] if capture_traces else None
+
+        for inst in batch:
+            dataset_item = inst.opik_item
+            messages = prompt_variant.get_messages(dataset_item)
+            raw_output = agent.invoke(messages).strip()
+
+            metric_result = self._metric(dataset_item, raw_output)
+            if hasattr(metric_result, "value"):
+                score = float(metric_result.value)
+            elif hasattr(metric_result, "score"):
+                score = float(metric_result.score)
+            else:
+                score = float(metric_result)
+
+            outputs.append({"output": raw_output})
+            scores.append(score)
+            try:
+                self._optimizer._gepa_live_metric_calls += 1
+            except Exception:
+                pass
+
+            if trajectories is not None:
+                trajectories.append(
+                    {
+                        "input": dataset_item,
+                        "output": raw_output,
+                        "score": score,
+                    }
+                )
+
+        return EvaluationBatch(
+            outputs=outputs, scores=scores, trajectories=trajectories
+        )
+
+    def make_reflective_dataset(
+        self,
+        candidate: dict[str, str],
+        eval_batch: EvaluationBatch[dict[str, Any], dict[str, Any]],
+        components_to_update: list[str],
+    ) -> dict[str, list[dict[str, Any]]]:
+        components = components_to_update or ["system_prompt"]
+        trajectories = eval_batch.trajectories or []
+
+        def _records() -> Iterable[dict[str, Any]]:
+            for traj in trajectories:
+                dataset_item = traj.get("input", {})
+                output_text = traj.get("output", "")
+                score = traj.get("score", 0.0)
+                feedback = f"Observed score={score:.4f}. Expected answer: {dataset_item.get('answer', '')}"
+                yield {
+                    "Inputs": {
+                        "text": dataset_item.get("input")
+                        or dataset_item.get("question")
+                        or "",
+                    },
+                    "Generated Outputs": output_text,
+                    "Feedback": feedback,
+                }
+
+        reflective_records = list(_records())
+        if not reflective_records:
+            logger.debug(
+                "No trajectories captured for candidate; returning empty reflective dataset"
+            )
+            reflective_records = []
+
+        return {component: reflective_records for component in components}